Skip to content

Commit

Permalink
[Clang] Handle non-ASCII after line splicing
Browse files Browse the repository at this point in the history
int a\
ス;

Previously, this failed to be parsed as a valid identifier.

Fixes #65156

Reviewed By: tahonermann

Differential Revision: https://reviews.llvm.org/D159345
  • Loading branch information
cor3ntin committed Sep 6, 2023
1 parent 89a81ec commit 3eb67d2
Show file tree
Hide file tree
Showing 4 changed files with 70 additions and 18 deletions.
2 changes: 2 additions & 0 deletions clang/docs/ReleaseNotes.rst
Original file line number Diff line number Diff line change
Expand Up @@ -214,6 +214,8 @@ Bug Fixes in This Version
(`#64987 <https://github.com/llvm/llvm-project/issues/64987>`_)
- Support MSVC predefined macro expressions in constant expressions and in
local structs.
- Correctly parse non-ASCII identifiers that appear immediately after a line splice
(`#65156 <https://github.com/llvm/llvm-project/issues/65156>`_)

Bug Fixes to Compiler Builtins
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Expand Down
3 changes: 2 additions & 1 deletion clang/include/clang/Lex/Lexer.h
Original file line number Diff line number Diff line change
Expand Up @@ -805,9 +805,10 @@ class Lexer : public PreprocessorLexer {
/// Try to consume an identifier character encoded in UTF-8.
/// \param CurPtr Points to the start of the (potential) UTF-8 code unit
/// sequence. On success, updated to point past the end of it.
/// \param Result The token being formed.
/// \return \c true if a UTF-8 sequence mapping to an acceptable identifier
/// character was lexed, \c false otherwise.
bool tryConsumeIdentifierUTF8Char(const char *&CurPtr);
bool tryConsumeIdentifierUTF8Char(const char *&CurPtr, Token &Result);
};

} // namespace clang
Expand Down
45 changes: 28 additions & 17 deletions clang/lib/Lex/Lexer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1750,15 +1750,21 @@ bool Lexer::tryConsumeIdentifierUCN(const char *&CurPtr, unsigned Size,
return true;
}

bool Lexer::tryConsumeIdentifierUTF8Char(const char *&CurPtr) {
const char *UnicodePtr = CurPtr;
bool Lexer::tryConsumeIdentifierUTF8Char(const char *&CurPtr, Token &Result) {
llvm::UTF32 CodePoint;
llvm::ConversionResult Result =
llvm::convertUTF8Sequence((const llvm::UTF8 **)&UnicodePtr,
(const llvm::UTF8 *)BufferEnd,
&CodePoint,
llvm::strictConversion);
if (Result != llvm::conversionOK)

// If a UTF-8 codepoint appears immediately after an escaped new line,
// CurPtr may point to the splicing \ on the preceding line,
// so we need to skip it.
unsigned FirstCodeUnitSize;
getCharAndSize(CurPtr, FirstCodeUnitSize);
const char *CharStart = CurPtr + FirstCodeUnitSize - 1;
const char *UnicodePtr = CharStart;

llvm::ConversionResult ConvResult = llvm::convertUTF8Sequence(
(const llvm::UTF8 **)&UnicodePtr, (const llvm::UTF8 *)BufferEnd,
&CodePoint, llvm::strictConversion);
if (ConvResult != llvm::conversionOK)
return false;

bool IsExtension = false;
Expand All @@ -1771,21 +1777,26 @@ bool Lexer::tryConsumeIdentifierUTF8Char(const char *&CurPtr) {
!PP->isPreprocessedOutput())
diagnoseInvalidUnicodeCodepointInIdentifier(
PP->getDiagnostics(), LangOpts, CodePoint,
makeCharRange(*this, CurPtr, UnicodePtr), /*IsFirst=*/false);
makeCharRange(*this, CharStart, UnicodePtr), /*IsFirst=*/false);
// We got a unicode codepoint that is neither a space nor
// a valid identifier part. Carry on as if the codepoint was
// valid for recovery purposes.
} else if (!isLexingRawMode()) {
if (IsExtension)
diagnoseExtensionInIdentifier(PP->getDiagnostics(), CodePoint,
makeCharRange(*this, CurPtr, UnicodePtr));
diagnoseExtensionInIdentifier(
PP->getDiagnostics(), CodePoint,
makeCharRange(*this, CharStart, UnicodePtr));
maybeDiagnoseIDCharCompat(PP->getDiagnostics(), CodePoint,
makeCharRange(*this, CurPtr, UnicodePtr),
makeCharRange(*this, CharStart, UnicodePtr),
/*IsFirst=*/false);
maybeDiagnoseUTF8Homoglyph(PP->getDiagnostics(), CodePoint,
makeCharRange(*this, CurPtr, UnicodePtr));
makeCharRange(*this, CharStart, UnicodePtr));
}

// Once we successfully parsed some UTF-8,
// calling ConsumeChar ensures the NeedsCleaning flag is set on the token
// being lexed, and that warnings about trailing spaces are emitted.
ConsumeChar(CurPtr, FirstCodeUnitSize, Result);
CurPtr = UnicodePtr;
return true;
}
Expand Down Expand Up @@ -1865,7 +1876,7 @@ bool Lexer::LexIdentifierContinue(Token &Result, const char *CurPtr) {
}
if (C == '\\' && tryConsumeIdentifierUCN(CurPtr, Size, Result))
continue;
if (!isASCII(C) && tryConsumeIdentifierUTF8Char(CurPtr))
if (!isASCII(C) && tryConsumeIdentifierUTF8Char(CurPtr, Result))
continue;
// Neither an expected Unicode codepoint nor a UCN.
break;
Expand Down Expand Up @@ -1985,7 +1996,7 @@ bool Lexer::LexNumericConstant(Token &Result, const char *CurPtr) {
// If we have a UCN or UTF-8 character (perhaps in a ud-suffix), continue.
if (C == '\\' && tryConsumeIdentifierUCN(CurPtr, Size, Result))
return LexNumericConstant(Result, CurPtr);
if (!isASCII(C) && tryConsumeIdentifierUTF8Char(CurPtr))
if (!isASCII(C) && tryConsumeIdentifierUTF8Char(CurPtr, Result))
return LexNumericConstant(Result, CurPtr);

// Update the location of token as well as BufferPtr.
Expand All @@ -2009,7 +2020,7 @@ const char *Lexer::LexUDSuffix(Token &Result, const char *CurPtr,
if (!isAsciiIdentifierStart(C)) {
if (C == '\\' && tryConsumeIdentifierUCN(CurPtr, Size, Result))
Consumed = true;
else if (!isASCII(C) && tryConsumeIdentifierUTF8Char(CurPtr))
else if (!isASCII(C) && tryConsumeIdentifierUTF8Char(CurPtr, Result))
Consumed = true;
else
return CurPtr;
Expand Down Expand Up @@ -2079,7 +2090,7 @@ const char *Lexer::LexUDSuffix(Token &Result, const char *CurPtr,
if (isAsciiIdentifierContinue(C)) {
CurPtr = ConsumeChar(CurPtr, Size, Result);
} else if (C == '\\' && tryConsumeIdentifierUCN(CurPtr, Size, Result)) {
} else if (!isASCII(C) && tryConsumeIdentifierUTF8Char(CurPtr)) {
} else if (!isASCII(C) && tryConsumeIdentifierUTF8Char(CurPtr, Result)) {
} else
break;
}
Expand Down
38 changes: 38 additions & 0 deletions clang/test/Lexer/escape_newline_unicode.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
// RUN: %clang_cc1 -verify=expected,c -x c -Wunused %s
// RUN: %clang_cc1 -verify=expected,cpp -x c++ -Wunused %s

void gh65156(void) {

int a\
= 42;
// expected-warning@-2 {{unused variable 'aス'}}

int b\
\
= 42;
// expected-warning@-2 {{backslash and newline separated by space}}
// expected-warning@-4 {{backslash and newline separated by space}}
// expected-warning@-5 {{unused variable 'bス'}}

int \
= 42;
// expected-warning@-2 {{unused variable 'スス'}}

int \
= 42;
// expected-warning@-2 {{unused variable 'ス'}}

}

void gh65156_err(void) {

int \
= 0;
// cpp-error@-2 {{expected unqualified-id}}
// c-error@-3 {{expected identifier}}


int a\
= 0;
// expected-error@-1 {{character <U+274C> not allowed in an identifier}}
}

0 comments on commit 3eb67d2

Please sign in to comment.