Skip to content

Commit

Permalink
Fix handling of medial hyphens in Unicode Names.
Browse files Browse the repository at this point in the history
If a Unicode name was stored in a way that caused
a medial hyphen to be at the end of a chunk, it would not
be properly ignored by the loose matching algorithm.

For example if `LEFT-TO-RIGHT OVERRIDE` was stored as
`LEFT-` [...], the `-` would not be ignored.

The generator now ensures nodes are not cut across
medial hyphen boundaries.

Fixes #64161

Differential Revision: https://reviews.llvm.org/D156518
  • Loading branch information
cor3ntin committed Jul 28, 2023
1 parent a428b5a commit 68410fb
Show file tree
Hide file tree
Showing 7 changed files with 20,940 additions and 20,841 deletions.
4 changes: 4 additions & 0 deletions clang/docs/ReleaseNotes.rst
Original file line number Diff line number Diff line change
Expand Up @@ -117,6 +117,10 @@ Bug Fixes to C++ Support
This limit can be modified by `-fconstexpr-steps`.
(`#63562 <https://github.com/llvm/llvm-project/issues/63562>`_)

- Fix a crash caused by some named unicode escape sequences designating
a Unicode character whose name contains a ``-``.
(`Fixes #64161 <https://github.com/llvm/llvm-project/issues/64161>`_)

Bug Fixes to AST Handling
^^^^^^^^^^^^^^^^^^^^^^^^^

Expand Down
6 changes: 3 additions & 3 deletions clang/lib/Lex/LiteralSupport.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -385,10 +385,10 @@ void clang::expandUCNs(SmallVectorImpl<char> &Buf, StringRef Input) {
++I;
auto Delim = std::find(I, Input.end(), '}');
assert(Delim != Input.end());
StringRef Name(I, std::distance(I, Delim));
std::optional<llvm::sys::unicode::LooseMatchingResult> Res =
llvm::sys::unicode::nameToCodepointLooseMatching(
StringRef(I, std::distance(I, Delim)));
assert(Res);
llvm::sys::unicode::nameToCodepointLooseMatching(Name);
assert(Res && "could not find a codepoint that was previously found");
CodePoint = Res->CodePoint;
assert(CodePoint != 0xFFFFFFFF);
appendCodePoint(CodePoint, Buf);
Expand Down
3 changes: 3 additions & 0 deletions clang/test/Preprocessor/ucn-pp-identifier.c
Original file line number Diff line number Diff line change
Expand Up @@ -159,3 +159,6 @@ int a\N{LATIN CAPITAL LETTER A WITH GRAVE??>;
// expected-warning@-1 {{incomplete}}\
// expected-error@-1 {{expected unqualified-id}}
#endif

// GH64161
int A\N{LEFT-TO-RIGHT OVERRIDE}; // expected-error {{character <U+202D> not allowed in an identifier}}
41 changes: 19 additions & 22 deletions llvm/lib/Support/UnicodeNameToCodepoint.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -119,7 +119,7 @@ static Node readNode(uint32_t Offset, const Node *Parent = nullptr) {

static bool startsWith(StringRef Name, StringRef Needle, bool Strict,
std::size_t &Consummed, char &PreviousCharInName,
char &PreviousCharInNeedle, bool IsPrefix = false) {
bool IsPrefix = false) {

Consummed = 0;
if (Strict) {
Expand All @@ -135,18 +135,18 @@ static bool startsWith(StringRef Name, StringRef Needle, bool Strict,
auto NeedlePos = Needle.begin();

char PreviousCharInNameOrigin = PreviousCharInName;
char PreviousCharInNeedleOrigin = PreviousCharInNeedle;

char PreviousCharInNeedle = *Needle.begin();
auto IgnoreSpaces = [](auto It, auto End, char &PreviousChar,
bool IgnoreEnd = false) {
bool IsPrefix = false) {
while (It != End) {
const auto Next = std::next(It);
// Ignore spaces, underscore, medial hyphens
// https://unicode.org/reports/tr44/#UAX44-LM2.
// The generator ensures a needle never ends (or starts) by a medial
// hyphen https://unicode.org/reports/tr44/#UAX44-LM2.
bool Ignore =
*It == ' ' || *It == '_' ||
(*It == '-' && isAlnum(PreviousChar) &&
((Next != End && isAlnum(*Next)) || (Next == End && IgnoreEnd)));
((Next != End && isAlnum(*Next)) || (Next == End && IsPrefix)));
PreviousChar = *It;
if (!Ignore)
break;
Expand All @@ -171,20 +171,18 @@ static bool startsWith(StringRef Name, StringRef Needle, bool Strict,
Consummed = std::distance(Name.begin(), NamePos);
if (NeedlePos != Needle.end()) {
PreviousCharInName = PreviousCharInNameOrigin;
PreviousCharInNeedle = PreviousCharInNeedleOrigin;
}
return NeedlePos == Needle.end();
}

static std::tuple<Node, bool, uint32_t>
compareNode(uint32_t Offset, StringRef Name, bool Strict,
char PreviousCharInName, char PreviousCharInNeedle,
BufferType &Buffer, const Node *Parent = nullptr) {
char PreviousCharInName, BufferType &Buffer,
const Node *Parent = nullptr) {
Node N = readNode(Offset, Parent);
std::size_t Consummed = 0;
bool DoesStartWith =
N.IsRoot || startsWith(Name, N.Name, Strict, Consummed,
PreviousCharInName, PreviousCharInNeedle);
bool DoesStartWith = N.IsRoot || startsWith(Name, N.Name, Strict, Consummed,
PreviousCharInName);
if (!DoesStartWith)
return std::make_tuple(N, false, 0);

Expand All @@ -199,7 +197,7 @@ compareNode(uint32_t Offset, StringRef Name, bool Strict,
uint32_t Value;
std::tie(C, Matches, Value) =
compareNode(ChildOffset, Name.substr(Consummed), Strict,
PreviousCharInName, PreviousCharInNeedle, Buffer, &N);
PreviousCharInName, Buffer, &N);
if (Matches) {
std::reverse_copy(C.Name.begin(), C.Name.end(),
std::back_inserter(Buffer));
Expand All @@ -215,7 +213,7 @@ compareNode(uint32_t Offset, StringRef Name, bool Strict,

static std::tuple<Node, bool, uint32_t>
compareNode(uint32_t Offset, StringRef Name, bool Strict, BufferType &Buffer) {
return compareNode(Offset, Name, Strict, 0, 0, Buffer);
return compareNode(Offset, Name, Strict, 0, Buffer);
}

// clang-format off
Expand Down Expand Up @@ -262,7 +260,6 @@ static std::size_t findSyllable(StringRef Name, bool Strict,
char &PreviousInName, int &Pos, int Column) {
assert(Column == 0 || Column == 1 || Column == 2);
static std::size_t CountPerColumn[] = {LCount, VCount, TCount};
char NeedleStart = 0;
int Len = -1;
int Prev = PreviousInName;
for (std::size_t I = 0; I < CountPerColumn[Column]; I++) {
Expand All @@ -271,8 +268,8 @@ static std::size_t findSyllable(StringRef Name, bool Strict,
continue;
std::size_t Consummed = 0;
char PreviousInNameCopy = PreviousInName;
bool DoesStartWith = startsWith(Name, Syllable, Strict, Consummed,
PreviousInNameCopy, NeedleStart);
bool DoesStartWith =
startsWith(Name, Syllable, Strict, Consummed, PreviousInNameCopy);
if (!DoesStartWith)
continue;
Len = Consummed;
Expand All @@ -290,9 +287,9 @@ nameToHangulCodePoint(StringRef Name, bool Strict, BufferType &Buffer) {
Buffer.clear();
// Hangul Syllable Decomposition
std::size_t Consummed = 0;
char NameStart = 0, NeedleStart = 0;
bool DoesStartWith = startsWith(Name, "HANGUL SYLLABLE ", Strict, Consummed,
NameStart, NeedleStart);
char NameStart = 0;
bool DoesStartWith =
startsWith(Name, "HANGUL SYLLABLE ", Strict, Consummed, NameStart);
if (!DoesStartWith)
return std::nullopt;
Name = Name.substr(Consummed);
Expand Down Expand Up @@ -348,9 +345,9 @@ nameToGeneratedCodePoint(StringRef Name, bool Strict, BufferType &Buffer) {
for (auto &&Item : GeneratedNamesDataTable) {
Buffer.clear();
std::size_t Consummed = 0;
char NameStart = 0, NeedleStart = 0;
char NameStart = 0;
bool DoesStartWith = startsWith(Name, Item.Prefix, Strict, Consummed,
NameStart, NeedleStart, /*isPrefix*/ true);
NameStart, /*IsPrefix=*/true);
if (!DoesStartWith)
continue;
auto Number = Name.substr(Consummed);
Expand Down

0 comments on commit 68410fb

Please sign in to comment.