Skip to content

Commit

Permalink
Resubmit r325107 (case folding DJB hash)
Browse files Browse the repository at this point in the history
The issue was that the hash function was generating different results depending
on the signedness of char on the host platform. This commit fixes the issue by
explicitly using an unsigned char type to prevent sign extension and
adds some extra tests.

The original commit message was:

This patch implements a variant of the DJB hash function which folds the
input according to the algorithm in the Dwarf 5 specification (Section
6.1.1.4.5), which in turn references the Unicode Standard (Section 5.18,
"Case Mappings").

To achieve this, I have added a llvm::sys::unicode::foldCharSimple
function, which performs this mapping. The implementation of this
function was generated from the CaseFolding.txt file from the Unicode
spec using a python script (which is also included in this patch). The
script tries to optimize the function by coalescing adjacent mappings
with the same shift and stride (terms I made up). Theoretically, it
could be made a bit smarter and merge adjacent blocks that were
interrupted by only one or two characters with exceptional mapping, but
this would save only a couple of branches, while it would greatly
complicate the implementation, so I deemed it was not worth it.

Since we assume that the vast majority of the input characters will be
US-ASCII, the folding hash function has a fast-path for handling these,
and only whips out the full decode+fold+encode logic if we encounter a
character outside of this range. It might be possible to implement the
folding directly on utf8 sequences, but this would also bring a lot of
complexity for the few cases where we will actually need to process
non-ascii characters.

Reviewers: JDevlieghere, aprantl, probinson, dblaikie

Subscribers: mgorny, hintonda, echristo, clayborg, vleschuk, llvm-commits

Differential Revision: https://reviews.llvm.org/D42740

llvm-svn: 325732
  • Loading branch information
labath committed Feb 21, 2018
1 parent ba7a1f0 commit 3b17b84
Show file tree
Hide file tree
Showing 8 changed files with 1,063 additions and 2 deletions.
4 changes: 4 additions & 0 deletions llvm/include/llvm/Support/DJB.h
Expand Up @@ -20,6 +20,10 @@ namespace llvm {

/// The Bernstein hash function used by the DWARF accelerator tables.
///
/// \param Buffer the bytes to hash.
/// \param H the initial hash value the bytes are folded into; defaults to
///        5381.
/// \returns the resulting 32-bit hash.
uint32_t djbHash(StringRef Buffer, uint32_t H = 5381);

/// Computes the Bernstein hash after folding the input according to the Dwarf 5
/// standard case folding rules.
///
/// \param Buffer the text to fold and hash.
/// \param H the initial hash value the folded bytes are folded into; defaults
///        to 5381.
/// \returns the resulting 32-bit hash.
uint32_t caseFoldingDjbHash(StringRef Buffer, uint32_t H = 5381);
} // namespace llvm

#endif // LLVM_SUPPORT_DJB_H
4 changes: 4 additions & 0 deletions llvm/include/llvm/Support/Unicode.h
Expand Up @@ -60,6 +60,10 @@ bool isPrintable(int UCS);
/// * 1 for each of the remaining characters.
int columnWidthUTF8(StringRef Text);

/// Fold the input Unicode character according to the Simple Unicode case
/// folding rules.
///
/// \param C the character to fold.
/// \returns the folded character.
int foldCharSimple(int C);

} // namespace unicode
} // namespace sys
} // namespace llvm
Expand Down
1 change: 1 addition & 0 deletions llvm/lib/Support/CMakeLists.txt
Expand Up @@ -113,6 +113,7 @@ add_llvm_library(LLVMSupport
Triple.cpp
Twine.cpp
Unicode.cpp
UnicodeCaseFold.cpp
YAMLParser.cpp
YAMLTraits.cpp
raw_os_ostream.cpp
Expand Down
80 changes: 78 additions & 2 deletions llvm/lib/Support/DJB.cpp
Expand Up @@ -12,9 +12,85 @@
//===----------------------------------------------------------------------===//

#include "llvm/Support/DJB.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/ConvertUTF.h"
#include "llvm/Support/Unicode.h"

using namespace llvm;

/// Performs one step of the Bernstein hash: multiplies the running hash \p H
/// by 33 and adds the byte \p C. Arithmetic wraps modulo 2^32.
///
/// \p C is taken as an unsigned char so that the byte value added to the hash
/// is always in the range [0, 255].
static inline uint32_t djbHashChar(unsigned char C, uint32_t H) {
  // (H << 5) + H == H * 33 in wrapping 32-bit unsigned arithmetic.
  const uint32_t Scaled = H * 33u;
  return Scaled + C;
}

/// Computes the Bernstein (DJB) hash of \p Buffer, starting from the running
/// hash value \p H, and returns the updated hash.
///
/// Every byte is read as an unsigned char before being folded into the hash.
/// Reading the bytes through a plain `char` would sign-extend values >= 0x80
/// on hosts where `char` is signed, making the result host-dependent.
uint32_t llvm::djbHash(StringRef Buffer, uint32_t H) {
  // Hash each byte exactly once; the earlier signed-char loop is gone.
  for (unsigned char C : Buffer.bytes())
    H = djbHashChar(C, H);
  return H;
}

/// Decodes one code point from the front of \p Buffer using lenient UTF-8
/// conversion, removes the consumed bytes from \p Buffer and returns the
/// decoded value.
///
/// \p Buffer must not be empty.
static UTF32 chopOneUTF32(StringRef &Buffer) {
  // The conversion is expected to always produce a "reasonable" value for
  // non-empty input when run in lenient mode.
  assert(!Buffer.empty());

  const UTF8 *const SrcStart = reinterpret_cast<const UTF8 *>(Buffer.begin());
  const UTF8 *const SrcEnd = reinterpret_cast<const UTF8 *>(Buffer.end());

  UTF32 CodePoint;
  const UTF8 *SrcCursor = SrcStart;
  UTF32 *DstCursor = &CodePoint;

  // The destination has room for exactly one UTF-32 unit, so at most one code
  // point is written; SrcCursor is advanced past the bytes that were consumed.
  ConvertUTF8toUTF32(&SrcCursor, SrcEnd, &DstCursor, &CodePoint + 1,
                     lenientConversion);

  Buffer = Buffer.drop_front(SrcCursor - SrcStart);
  return CodePoint;
}

/// Encodes the code point \p C as UTF-8 into \p Storage and returns a
/// StringRef covering the bytes that were written. The returned reference
/// points into \p Storage.
static StringRef toUTF8(UTF32 C, MutableArrayRef<UTF8> Storage) {
  UTF8 *const OutStart = Storage.begin();
  UTF8 *OutCursor = OutStart;
  const UTF32 *InCursor = &C;

  // Strict mode: a case-folded character is expected to always be a valid
  // unicode character, so anything else indicates a bug.
  ConversionResult Status = ConvertUTF32toUTF8(
      &InCursor, &C + 1, &OutCursor, Storage.end(), strictConversion);
  assert(Status == conversionOK && "Case folding produced invalid char?");
  (void)Status; // Only inspected by the assert above.

  return StringRef(reinterpret_cast<char *>(OutStart), OutCursor - OutStart);
}

/// Folds the code point \p C using the simple unicode folding rules plus the
/// DWARF v5 additions to them.
static UTF32 foldCharDwarf(UTF32 C) {
  switch (C) {
  // DWARF v5 addition: both of these are folded into a plain "i".
  case 0x130: // Latin Capital Letter I With Dot Above
  case 0x131: // Latin Small Letter Dotless I
    return 'i';
  default:
    return sys::unicode::foldCharSimple(C);
  }
}

/// Slow path of the case-folding hash: decodes one code point from the front
/// of \p Buffer (consuming its bytes), folds it, re-encodes the folded value
/// as UTF-8 and feeds those bytes into the running hash \p H.
///
/// \returns the updated hash value.
static uint32_t caseFoldingDjbHashCharSlow(StringRef &Buffer, uint32_t H) {
  const UTF32 FoldedChar = foldCharDwarf(chopOneUTF32(Buffer));

  // Scratch space large enough for the UTF-8 encoding of one code point.
  std::array<UTF8, UNI_MAX_UTF8_BYTES_PER_CODE_POINT> Utf8Bytes;
  return djbHash(toUTF8(FoldedChar, Utf8Bytes), H);
}

/// Computes the Bernstein hash of \p Buffer after case folding, starting from
/// the running hash value \p H.
///
/// Bytes in the US-ASCII range are folded and hashed inline; anything else is
/// handed to the decode+fold+encode slow path, which consumes the bytes it
/// processes from the buffer.
uint32_t llvm::caseFoldingDjbHash(StringRef Buffer, uint32_t H) {
  while (!Buffer.empty()) {
    const unsigned char Byte = Buffer.front();
    if (LLVM_LIKELY(Byte < 0x80)) {
      // US-ASCII is a single byte in utf-8 and by far the most common case,
      // so it gets a dedicated fast path: uppercase letters are mapped onto
      // their lowercase counterparts, everything else is hashed unchanged.
      const bool IsUpper = Byte >= 'A' && Byte <= 'Z';
      H = djbHashChar(IsUpper ? Byte + ('a' - 'A') : Byte, H);
      Buffer = Buffer.drop_front();
    } else {
      H = caseFoldingDjbHashCharSlow(Buffer, H);
    }
  }
  return H;
}

0 comments on commit 3b17b84

Please sign in to comment.