Skip to content

Commit

Permalink
Implement P1949
Browse files Browse the repository at this point in the history
This adds the Unicode 13 data for XID_Start and XID_Continue.
The definition of valid identifier is changed in all C++ modes
as P1949 (https://wg21.link/p1949) was accepted by WG21 as a defect
report.
  • Loading branch information
cor3ntin authored and AaronBallman committed Aug 18, 2021
1 parent f22e586 commit 4e80636
Show file tree
Hide file tree
Showing 16 changed files with 516 additions and 241 deletions.
5 changes: 5 additions & 0 deletions clang/include/clang/Basic/CharInfo.h
Expand Up @@ -43,6 +43,11 @@ LLVM_READNONE inline bool isASCII(char c) {
return static_cast<unsigned char>(c) <= 127;
}

LLVM_READNONE inline bool isASCII(unsigned char c) { return c <= 127; }

/// Returns true if this is an ASCII character.
LLVM_READNONE inline bool isASCII(uint32_t c) { return c <= 127; }

/// Returns true if this is a valid first character of a C identifier,
/// which is [a-zA-Z_].
LLVM_READONLY inline bool isIdentifierHead(unsigned char c,
Expand Down
9 changes: 4 additions & 5 deletions clang/include/clang/Basic/DiagnosticLexKinds.td
Expand Up @@ -113,8 +113,10 @@ def warn_four_char_character_literal : Warning<
// Unicode and UCNs
def err_invalid_utf8 : Error<
"source file is not valid UTF-8">;
def err_non_ascii : Error<
"non-ASCII characters are not allowed outside of literals and identifiers">;
def err_character_not_allowed : Error<
"unexpected character <U+%0>">;
def err_character_not_allowed_identifier : Error<
"character <U+%0> not allowed %select{in|at the start of}1 an identifier">;
def ext_unicode_whitespace : ExtWarn<
"treating Unicode character as whitespace">,
InGroup<DiagGroup<"unicode-whitespace">>;
Expand Down Expand Up @@ -150,9 +152,6 @@ def warn_c99_compat_unicode_id : Warning<
"%select{using this character in an identifier|starting an identifier with "
"this character}0 is incompatible with C99">,
InGroup<C99Compat>, DefaultIgnore;
def warn_cxx98_compat_unicode_id : Warning<
"using this character in an identifier is incompatible with C++98">,
InGroup<CXX98Compat>, DefaultIgnore;

def warn_cxx98_compat_literal_ucn_escape_basic_scs : Warning<
"specifying character '%0' with a universal character name "
Expand Down
128 changes: 91 additions & 37 deletions clang/lib/Lex/Lexer.cpp
Expand Up @@ -1446,19 +1446,30 @@ void Lexer::SetByteOffset(unsigned Offset, bool StartOfLine) {
IsAtPhysicalStartOfLine = StartOfLine;
}

static bool isUnicodeWhitespace(uint32_t Codepoint) {
static const llvm::sys::UnicodeCharSet UnicodeWhitespaceChars(
UnicodeWhitespaceCharRanges);
return UnicodeWhitespaceChars.contains(Codepoint);
}

static bool isAllowedIDChar(uint32_t C, const LangOptions &LangOpts) {
if (LangOpts.AsmPreprocessor) {
return false;
} else if (LangOpts.DollarIdents && '$' == C) {
return true;
} else if (LangOpts.CPlusPlus11 || LangOpts.C11) {
} else if (LangOpts.CPlusPlus) {
// A non-leading codepoint must have the XID_Continue property.
// XIDContinueRanges doesn't contains characters also in XIDStartRanges,
// so we need to check both tables.
// '_' doesn't have the XID_Continue property but is allowed in C++.
static const llvm::sys::UnicodeCharSet XIDStartChars(XIDStartRanges);
static const llvm::sys::UnicodeCharSet XIDContinueChars(XIDContinueRanges);
return C == '_' || XIDStartChars.contains(C) ||
XIDContinueChars.contains(C);
} else if (LangOpts.C11) {
static const llvm::sys::UnicodeCharSet C11AllowedIDChars(
C11AllowedIDCharRanges);
return C11AllowedIDChars.contains(C);
} else if (LangOpts.CPlusPlus) {
static const llvm::sys::UnicodeCharSet CXX03AllowedIDChars(
CXX03AllowedIDCharRanges);
return CXX03AllowedIDChars.contains(C);
} else {
static const llvm::sys::UnicodeCharSet C99AllowedIDChars(
C99AllowedIDCharRanges);
Expand All @@ -1467,20 +1478,24 @@ static bool isAllowedIDChar(uint32_t C, const LangOptions &LangOpts) {
}

static bool isAllowedInitiallyIDChar(uint32_t C, const LangOptions &LangOpts) {
assert(isAllowedIDChar(C, LangOpts));
if (LangOpts.AsmPreprocessor) {
return false;
} else if (LangOpts.CPlusPlus11 || LangOpts.C11) {
}
if (LangOpts.CPlusPlus) {
static const llvm::sys::UnicodeCharSet XIDStartChars(XIDStartRanges);
// '_' doesn't have the XID_Start property but is allowed in C++.
return C == '_' || XIDStartChars.contains(C);
}
if (!isAllowedIDChar(C, LangOpts))
return false;
if (LangOpts.C11) {
static const llvm::sys::UnicodeCharSet C11DisallowedInitialIDChars(
C11DisallowedInitialIDCharRanges);
return !C11DisallowedInitialIDChars.contains(C);
} else if (LangOpts.CPlusPlus) {
return true;
} else {
static const llvm::sys::UnicodeCharSet C99DisallowedInitialIDChars(
C99DisallowedInitialIDCharRanges);
return !C99DisallowedInitialIDChars.contains(C);
}
static const llvm::sys::UnicodeCharSet C99DisallowedInitialIDChars(
C99DisallowedInitialIDCharRanges);
return !C99DisallowedInitialIDChars.contains(C);
}

static inline CharSourceRange makeCharRange(Lexer &L, const char *Begin,
Expand Down Expand Up @@ -1512,16 +1527,6 @@ static void maybeDiagnoseIDCharCompat(DiagnosticsEngine &Diags, uint32_t C,
<< CannotStartIdentifier;
}
}

// Check C++98 compatibility.
if (!Diags.isIgnored(diag::warn_cxx98_compat_unicode_id, Range.getBegin())) {
static const llvm::sys::UnicodeCharSet CXX03AllowedIDChars(
CXX03AllowedIDCharRanges);
if (!CXX03AllowedIDChars.contains(C)) {
Diags.Report(Range.getBegin(), diag::warn_cxx98_compat_unicode_id)
<< Range;
}
}
}

/// After encountering UTF-8 character C and interpreting it as an identifier
Expand Down Expand Up @@ -1608,14 +1613,55 @@ static void maybeDiagnoseUTF8Homoglyph(DiagnosticsEngine &Diags, uint32_t C,
}
}

static void diagnoseInvalidUnicodeCodepointInIdentifier(
DiagnosticsEngine &Diags, const LangOptions &LangOpts, uint32_t CodePoint,
CharSourceRange Range, bool IsFirst) {
if (isASCII(CodePoint))
return;

bool IsIDStart = isAllowedInitiallyIDChar(CodePoint, LangOpts);
bool IsIDContinue = IsIDStart || isAllowedIDChar(CodePoint, LangOpts);

if ((IsFirst && IsIDStart) || (!IsFirst && IsIDContinue))
return;

bool InvalidOnlyAtStart = IsFirst && !IsIDStart && IsIDContinue;

llvm::SmallString<5> CharBuf;
llvm::raw_svector_ostream CharOS(CharBuf);
llvm::write_hex(CharOS, CodePoint, llvm::HexPrintStyle::Upper, 4);

if (!IsFirst || InvalidOnlyAtStart) {
Diags.Report(Range.getBegin(), diag::err_character_not_allowed_identifier)
<< Range << CharBuf << int(InvalidOnlyAtStart)
<< FixItHint::CreateRemoval(Range);
} else {
Diags.Report(Range.getBegin(), diag::err_character_not_allowed)
<< Range << CharBuf << FixItHint::CreateRemoval(Range);
}
}

bool Lexer::tryConsumeIdentifierUCN(const char *&CurPtr, unsigned Size,
Token &Result) {
const char *UCNPtr = CurPtr + Size;
uint32_t CodePoint = tryReadUCN(UCNPtr, CurPtr, /*Token=*/nullptr);
if (CodePoint == 0 || !isAllowedIDChar(CodePoint, LangOpts))
if (CodePoint == 0) {
return false;
}

if (!isLexingRawMode())
if (!isAllowedIDChar(CodePoint, LangOpts)) {
if (isASCII(CodePoint) || isUnicodeWhitespace(CodePoint))
return false;
if (!isLexingRawMode())
diagnoseInvalidUnicodeCodepointInIdentifier(
PP->getDiagnostics(), LangOpts, CodePoint,
makeCharRange(*this, CurPtr, UCNPtr),
/*IsFirst=*/false);

// We got a unicode codepoint that is neither a space nor a
// a valid identifier part.
// Carry on as if the codepoint was valid for recovery purposes.
} else if (!isLexingRawMode())
maybeDiagnoseIDCharCompat(PP->getDiagnostics(), CodePoint,
makeCharRange(*this, CurPtr, UCNPtr),
/*IsFirst=*/false);
Expand All @@ -1638,11 +1684,21 @@ bool Lexer::tryConsumeIdentifierUTF8Char(const char *&CurPtr) {
(const llvm::UTF8 *)BufferEnd,
&CodePoint,
llvm::strictConversion);
if (Result != llvm::conversionOK ||
!isAllowedIDChar(static_cast<uint32_t>(CodePoint), LangOpts))
if (Result != llvm::conversionOK)
return false;

if (!isLexingRawMode()) {
if (!isAllowedIDChar(static_cast<uint32_t>(CodePoint), LangOpts)) {
if (isASCII(CodePoint) || isUnicodeWhitespace(CodePoint))
return false;

if (!isLexingRawMode())
diagnoseInvalidUnicodeCodepointInIdentifier(
PP->getDiagnostics(), LangOpts, CodePoint,
makeCharRange(*this, CurPtr, UnicodePtr), /*IsFirst=*/false);
// We got a unicode codepoint that is neither a space nor a
// a valid identifier part. Carry on as if the codepoint was
// valid for recovery purposes.
} else if (!isLexingRawMode()) {
maybeDiagnoseIDCharCompat(PP->getDiagnostics(), CodePoint,
makeCharRange(*this, CurPtr, UnicodePtr),
/*IsFirst=*/false);
Expand Down Expand Up @@ -3136,10 +3192,8 @@ uint32_t Lexer::tryReadUCN(const char *&StartPtr, const char *SlashLoc,

bool Lexer::CheckUnicodeWhitespace(Token &Result, uint32_t C,
const char *CurPtr) {
static const llvm::sys::UnicodeCharSet UnicodeWhitespaceChars(
UnicodeWhitespaceCharRanges);
if (!isLexingRawMode() && !PP->isPreprocessedOutput() &&
UnicodeWhitespaceChars.contains(C)) {
isUnicodeWhitespace(C)) {
Diag(BufferPtr, diag::ext_unicode_whitespace)
<< makeCharRange(*this, BufferPtr, CurPtr);

Expand All @@ -3150,7 +3204,7 @@ bool Lexer::CheckUnicodeWhitespace(Token &Result, uint32_t C,
}

bool Lexer::LexUnicode(Token &Result, uint32_t C, const char *CurPtr) {
if (isAllowedIDChar(C, LangOpts) && isAllowedInitiallyIDChar(C, LangOpts)) {
if (isAllowedInitiallyIDChar(C, LangOpts)) {
if (!isLexingRawMode() && !ParsingPreprocessorDirective &&
!PP->isPreprocessedOutput()) {
maybeDiagnoseIDCharCompat(PP->getDiagnostics(), C,
Expand All @@ -3165,8 +3219,8 @@ bool Lexer::LexUnicode(Token &Result, uint32_t C, const char *CurPtr) {
}

if (!isLexingRawMode() && !ParsingPreprocessorDirective &&
!PP->isPreprocessedOutput() &&
!isASCII(*BufferPtr) && !isAllowedIDChar(C, LangOpts)) {
!PP->isPreprocessedOutput() && !isASCII(*BufferPtr) &&
!isAllowedInitiallyIDChar(C, LangOpts) && !isUnicodeWhitespace(C)) {
// Non-ASCII characters tend to creep into source code unintentionally.
// Instead of letting the parser complain about the unknown token,
// just drop the character.
Expand All @@ -3176,9 +3230,9 @@ bool Lexer::LexUnicode(Token &Result, uint32_t C, const char *CurPtr) {
// loophole in the mapping of Unicode characters to basic character set
// characters that allows us to map these particular characters to, say,
// whitespace.
Diag(BufferPtr, diag::err_non_ascii)
<< FixItHint::CreateRemoval(makeCharRange(*this, BufferPtr, CurPtr));

diagnoseInvalidUnicodeCodepointInIdentifier(
PP->getDiagnostics(), LangOpts, C,
makeCharRange(*this, BufferPtr, CurPtr), /*IsStart*/ true);
BufferPtr = CurPtr;
return false;
}
Expand Down

0 comments on commit 4e80636

Please sign in to comment.