Skip to content

Commit

Permalink
[Clang] Implement CWG2640 Allow more characters in an n-char sequence
Browse files Browse the repository at this point in the history
Reviewed By: #clang-language-wg, aaron.ballman, tahonermann

Differential Revision: https://reviews.llvm.org/D138861
  • Loading branch information
cor3ntin committed Dec 13, 2022
1 parent d4fd275 commit dbfe446
Show file tree
Hide file tree
Showing 9 changed files with 96 additions and 48 deletions.
1 change: 1 addition & 0 deletions clang/docs/ReleaseNotes.rst
Expand Up @@ -705,6 +705,7 @@ C++2b Feature Support
- Implemented "char8_t Compatibility and Portability Fix" (`P2513R3 <https://wg21.link/P2513R3>`_).
This change was applied to C++20 as a Defect Report.
- Implemented "Permitting static constexpr variables in constexpr functions" (`P2647R1 <https://wg21.link/P2647R1>`_).
- Implemented `CWG2640 Allow more characters in an n-char sequence <https://wg21.link/CWG2640>`_.

CUDA/HIP Language Changes in Clang
----------------------------------
Expand Down
2 changes: 1 addition & 1 deletion clang/include/clang/Lex/Lexer.h
Expand Up @@ -772,7 +772,7 @@ class Lexer : public PreprocessorLexer {
llvm::Optional<uint32_t>
tryReadNumericUCN(const char *&StartPtr, const char *SlashLoc, Token *Result);
llvm::Optional<uint32_t> tryReadNamedUCN(const char *&StartPtr,
Token *Result);
const char *SlashLoc, Token *Result);

/// Read a universal character name.
///
Expand Down
75 changes: 42 additions & 33 deletions clang/lib/Lex/Lexer.cpp
Expand Up @@ -1194,15 +1194,16 @@ static char GetTrigraphCharForLetter(char Letter) {
/// whether trigraphs are enabled or not.
static char DecodeTrigraphChar(const char *CP, Lexer *L, bool Trigraphs) {
char Res = GetTrigraphCharForLetter(*CP);
if (!Res || !L) return Res;
if (!Res)
return Res;

if (!Trigraphs) {
if (!L->isLexingRawMode())
if (L && !L->isLexingRawMode())
L->Diag(CP-2, diag::trigraph_ignored);
return 0;
}

if (!L->isLexingRawMode())
if (L && !L->isLexingRawMode())
L->Diag(CP-2, diag::trigraph_converted) << StringRef(&Res, 1);
return Res;
}
Expand Down Expand Up @@ -3241,7 +3242,7 @@ llvm::Optional<uint32_t> Lexer::tryReadNumericUCN(const char *&StartPtr,
if (!Delimited)
break;
if (Diagnose)
Diag(BufferPtr, diag::warn_delimited_ucn_incomplete)
Diag(SlashLoc, diag::warn_delimited_ucn_incomplete)
<< StringRef(KindLoc, 1);
return std::nullopt;
}
Expand All @@ -3260,21 +3261,21 @@ llvm::Optional<uint32_t> Lexer::tryReadNumericUCN(const char *&StartPtr,

if (Count == 0) {
if (Diagnose)
Diag(StartPtr, FoundEndDelimiter ? diag::warn_delimited_ucn_empty
Diag(SlashLoc, FoundEndDelimiter ? diag::warn_delimited_ucn_empty
: diag::warn_ucn_escape_no_digits)
<< StringRef(KindLoc, 1);
return std::nullopt;
}

if (Delimited && Kind == 'U') {
if (Diagnose)
Diag(StartPtr, diag::err_hex_escape_no_digits) << StringRef(KindLoc, 1);
Diag(SlashLoc, diag::err_hex_escape_no_digits) << StringRef(KindLoc, 1);
return std::nullopt;
}

if (!Delimited && Count != NumHexDigits) {
if (Diagnose) {
Diag(BufferPtr, diag::warn_ucn_escape_incomplete);
Diag(SlashLoc, diag::warn_ucn_escape_incomplete);
// If the user wrote \U1234, suggest a fixit to \u.
if (Count == 4 && NumHexDigits == 8) {
CharSourceRange URange = makeCharRange(*this, KindLoc, KindLoc + 1);
Expand All @@ -3286,15 +3287,18 @@ llvm::Optional<uint32_t> Lexer::tryReadNumericUCN(const char *&StartPtr,
}

if (Delimited && PP) {
Diag(BufferPtr, PP->getLangOpts().CPlusPlus2b
? diag::warn_cxx2b_delimited_escape_sequence
: diag::ext_delimited_escape_sequence)
Diag(SlashLoc, PP->getLangOpts().CPlusPlus2b
? diag::warn_cxx2b_delimited_escape_sequence
: diag::ext_delimited_escape_sequence)
<< /*delimited*/ 0 << (PP->getLangOpts().CPlusPlus ? 1 : 0);
}

if (Result) {
Result->setFlag(Token::HasUCN);
if (CurPtr - StartPtr == (ptrdiff_t)(Count + 2 + (Delimited ? 2 : 0)))
// If the UCN contains either a trigraph or a line splicing,
// we need to call getAndAdvanceChar again to set the appropriate flags
// on Result.
if (CurPtr - StartPtr == (ptrdiff_t)(Count + 1 + (Delimited ? 2 : 0)))
StartPtr = CurPtr;
else
while (StartPtr != CurPtr)
Expand All @@ -3306,6 +3310,7 @@ llvm::Optional<uint32_t> Lexer::tryReadNumericUCN(const char *&StartPtr,
}

llvm::Optional<uint32_t> Lexer::tryReadNamedUCN(const char *&StartPtr,
const char *SlashLoc,
Token *Result) {
unsigned CharSize;
bool Diagnose = Result && !isLexingRawMode();
Expand All @@ -3319,7 +3324,7 @@ llvm::Optional<uint32_t> Lexer::tryReadNamedUCN(const char *&StartPtr,
C = getCharAndSize(CurPtr, CharSize);
if (C != '{') {
if (Diagnose)
Diag(StartPtr, diag::warn_ucn_escape_incomplete);
Diag(SlashLoc, diag::warn_ucn_escape_incomplete);
return std::nullopt;
}
CurPtr += CharSize;
Expand All @@ -3334,64 +3339,68 @@ llvm::Optional<uint32_t> Lexer::tryReadNamedUCN(const char *&StartPtr,
break;
}

if (!isAlphanumeric(C) && C != '_' && C != '-' && C != ' ')
if (isVerticalWhitespace(C))
break;
Buffer.push_back(C);
}

if (!FoundEndDelimiter || Buffer.empty()) {
if (Diagnose)
Diag(StartPtr, FoundEndDelimiter ? diag::warn_delimited_ucn_empty
Diag(SlashLoc, FoundEndDelimiter ? diag::warn_delimited_ucn_empty
: diag::warn_delimited_ucn_incomplete)
<< StringRef(KindLoc, 1);
return std::nullopt;
}

StringRef Name(Buffer.data(), Buffer.size());
llvm::Optional<char32_t> Res =
llvm::Optional<char32_t> Match =
llvm::sys::unicode::nameToCodepointStrict(Name);
llvm::Optional<llvm::sys::unicode::LooseMatchingResult> LooseMatch;
if (!Res) {
if (!isLexingRawMode()) {
Diag(StartPtr, diag::err_invalid_ucn_name)
<< StringRef(Buffer.data(), Buffer.size());
LooseMatch = llvm::sys::unicode::nameToCodepointLooseMatching(Name);
if (!Match) {
LooseMatch = llvm::sys::unicode::nameToCodepointLooseMatching(Name);
if (Diagnose) {
Diag(StartName, diag::err_invalid_ucn_name)
<< StringRef(Buffer.data(), Buffer.size())
<< makeCharRange(*this, StartName, CurPtr - CharSize);
if (LooseMatch) {
Diag(StartName, diag::note_invalid_ucn_name_loose_matching)
<< FixItHint::CreateReplacement(
makeCharRange(*this, StartName, CurPtr - CharSize),
LooseMatch->Name);
}
}
// When finding a match using Unicode loose matching rules
// recover after having emitted a diagnostic.
if (!LooseMatch)
return std::nullopt;
// We do not offer misspelled character names suggestions here
// as the set of what would be a valid suggestion depends on context,
// and we should not make invalid suggestions.
}

if (Diagnose && PP && !LooseMatch)
Diag(BufferPtr, PP->getLangOpts().CPlusPlus2b
? diag::warn_cxx2b_delimited_escape_sequence
: diag::ext_delimited_escape_sequence)
if (Diagnose && Match)
Diag(SlashLoc, PP->getLangOpts().CPlusPlus2b
? diag::warn_cxx2b_delimited_escape_sequence
: diag::ext_delimited_escape_sequence)
<< /*named*/ 1 << (PP->getLangOpts().CPlusPlus ? 1 : 0);

if (LooseMatch)
Res = LooseMatch->CodePoint;
// If no diagnostic has been emitted yet, likely because we are doing a
// tentative lexing, we do not want to recover here to make sure the token
// will not be incorrectly considered valid. This function will be called
// again and a diagnostic emitted then.
if (LooseMatch && Diagnose)
Match = LooseMatch->CodePoint;

if (Result) {
Result->setFlag(Token::HasUCN);
if (CurPtr - StartPtr == (ptrdiff_t)(Buffer.size() + 4))
// If the UCN contains either a trigraph or a line splicing,
// we need to call getAndAdvanceChar again to set the appropriate flags
// on Result.
if (CurPtr - StartPtr == (ptrdiff_t)(Buffer.size() + 3))
StartPtr = CurPtr;
else
while (StartPtr != CurPtr)
(void)getAndAdvanceChar(StartPtr, *Result);
} else {
StartPtr = CurPtr;
}
return *Res;
return Match ? llvm::Optional<uint32_t>(*Match) : std::nullopt;
}

uint32_t Lexer::tryReadUCN(const char *&StartPtr, const char *SlashLoc,
Expand All @@ -3403,7 +3412,7 @@ uint32_t Lexer::tryReadUCN(const char *&StartPtr, const char *SlashLoc,
if (Kind == 'u' || Kind == 'U')
CodePointOpt = tryReadNumericUCN(StartPtr, SlashLoc, Result);
else if (Kind == 'N')
CodePointOpt = tryReadNamedUCN(StartPtr, Result);
CodePointOpt = tryReadNamedUCN(StartPtr, SlashLoc, Result);

if (!CodePointOpt)
return 0;
Expand Down
9 changes: 4 additions & 5 deletions clang/lib/Lex/LiteralSupport.cpp
Expand Up @@ -548,11 +548,10 @@ static bool ProcessNamedUCNEscape(const char *ThisTokBegin,
return false;
}
ThisTokBuf++;
const char *ClosingBrace =
std::find_if_not(ThisTokBuf, ThisTokEnd, [](char C) {
return llvm::isAlnum(C) || llvm::isSpace(C) || C == '_' || C == '-';
});
bool Incomplete = ClosingBrace == ThisTokEnd || *ClosingBrace != '}';
const char *ClosingBrace = std::find_if(ThisTokBuf, ThisTokEnd, [](char C) {
return C == '}' || isVerticalWhitespace(C);
});
bool Incomplete = ClosingBrace == ThisTokEnd;
bool Empty = ClosingBrace == ThisTokBuf;
if (Incomplete || Empty) {
if (Diags) {
Expand Down
15 changes: 15 additions & 0 deletions clang/test/CXX/drs/dr26xx.cpp
Expand Up @@ -59,6 +59,21 @@ void TemplUse() {

// dr2636: na

namespace dr2640 { // dr2640: 16

int \N{Λ} = 0; //expected-error {{'Λ' is not a valid Unicode character name}} \
//expected-error {{expected unqualified-id}}
const char* emoji = "\N{🤡}"; // expected-error {{'🤡' is not a valid Unicode character name}} \
// expected-note 5{{did you mean}}

#define z(x) 0
#define dr2640_a z(
int x = dr2640_a\N{abc}); // expected-error {{'abc' is not a valid Unicode character name}}
int y = dr2640_a\N{LOTUS}); // expected-error {{character <U+1FAB7> not allowed in an identifier}} \
// expected-error {{use of undeclared identifier 'dr2640_a🪷'}} \
// expected-error {{extraneous ')' before ';'}}
}

// dr2642: na

namespace dr2644 { // dr2644: yes
Expand Down
5 changes: 5 additions & 0 deletions clang/test/Lexer/char-escapes-delimited.c
Expand Up @@ -96,6 +96,11 @@ void named(void) {
unsigned i = u'\N{GREEK CAPITAL LETTER DELTA}'; // ext-warning {{extension}} cxx2b-warning {{C++2b}}
char j = '\NN'; // expected-error {{expected '{' after '\N' escape sequence}} expected-warning {{multi-character character constant}}
unsigned k = u'\N{LOTUS'; // expected-error {{incomplete universal character name}}

const char* emoji = "\N{🤡}"; // expected-error {{'🤡' is not a valid Unicode character name}} \
// expected-note 5{{did you mean}}
const char* nested = "\N{\N{SPARKLE}}"; // expected-error {{'\N{SPARKLE' is not a valid Unicode character name}} \
// expected-note 5{{did you mean}}
}

void separators(void) {
Expand Down
1 change: 1 addition & 0 deletions clang/test/Lexer/unicode.c
Expand Up @@ -43,6 +43,7 @@ extern int \U00016AA2; // TANGSA LETTER GA - Added in Unicode 14
extern int \U0001E4D0; // 𞓐 NAG MUNDARI LETTER O - Added in Unicode 15
extern int _\N{TANGSA LETTER GA};
extern int _\N{TANGSALETTERGA}; // expected-error {{'TANGSALETTERGA' is not a valid Unicode character name}} \
// expected-error {{expected ';' after top level declarator}} \
// expected-note {{characters names in Unicode escape sequences are sensitive to case and whitespace}}


Expand Down
34 changes: 26 additions & 8 deletions clang/test/Preprocessor/ucn-pp-identifier.c
@@ -1,6 +1,6 @@
// RUN: %clang_cc1 %s -fsyntax-only -std=c99 -pedantic -verify=expected,ext -Wundef
// RUN: %clang_cc1 %s -fsyntax-only -x c++ -pedantic -verify=expected,ext -Wundef
// RUN: %clang_cc1 %s -fsyntax-only -x c++ -std=c++2b -pedantic -ftrigraphs -verify=expected,cxx2b -Wundef -Wpre-c++2b-compat
// RUN: %clang_cc1 %s -fsyntax-only -std=c99 -pedantic -verify=expected,ext -Wundef -DTRIGRAPHS=1
// RUN: %clang_cc1 %s -fsyntax-only -x c++ -pedantic -verify=expected,ext -Wundef -fno-trigraphs
// RUN: %clang_cc1 %s -fsyntax-only -x c++ -std=c++2b -pedantic -ftrigraphs -DTRIGRAPHS=1 -verify=expected,cxx2b -Wundef -Wpre-c++2b-compat
// RUN: %clang_cc1 %s -fsyntax-only -x c++ -pedantic -verify=expected,ext -Wundef -ftrigraphs -DTRIGRAPHS=1
// RUN: not %clang_cc1 %s -fsyntax-only -std=c99 -pedantic -Wundef 2>&1 | FileCheck -strict-whitespace %s

Expand Down Expand Up @@ -40,7 +40,6 @@
// ext-warning {{extension}} cxx2b-warning {{before C++2b}}
#define \N{WASTEBASKET} // expected-error {{macro name must be an identifier}} \
// ext-warning {{extension}} cxx2b-warning {{before C++2b}}

#define a\u0024

#if \u0110 // expected-warning {{is not defined, evaluates to 0}}
Expand Down Expand Up @@ -121,20 +120,39 @@ C 1
#define \u{123456789} // expected-error {{hex escape sequence out of range}} expected-error {{macro name must be an identifier}}
#define \u{ // expected-warning {{incomplete delimited universal character name; treating as '\' 'u' '{' identifier}} expected-error {{macro name must be an identifier}}
#define \u{fgh} // expected-warning {{incomplete delimited universal character name; treating as '\' 'u' '{' identifier}} expected-error {{macro name must be an identifier}}
#define \N{ // expected-warning {{incomplete delimited universal character name; treating as '\' 'N' '{' identifier}} expected-error {{macro name must be an identifier}}
#define \N{
// expected-warning@-1 {{incomplete delimited universal character name; treating as '\' 'N' '{' identifier}}
// expected-error@-2 {{macro name must be an identifier}}
#define \N{} // expected-warning {{empty delimited universal character name; treating as '\' 'N' '{' '}'}} expected-error {{macro name must be an identifier}}
#define \N{NOTATHING} // expected-error {{'NOTATHING' is not a valid Unicode character name}} \
// expected-error {{macro name must be an identifier}}
#define \NN // expected-warning {{incomplete universal character name; treating as '\' followed by identifier}} expected-error {{macro name must be an identifier}}
#define \N{GREEK_SMALL-LETTERALPHA} // expected-error {{'GREEK_SMALL-LETTERALPHA' is not a valid Unicode character name}} \
// expected-note {{characters names in Unicode escape sequences are sensitive to case and whitespaces}}
#define \N{🤡} // expected-error {{'🤡' is not a valid Unicode character name}} \
// expected-error {{macro name must be an identifier}}

#define CONCAT(A, B) A##B
int CONCAT(\N{GREEK, CAPITALLETTERALPHA}); // expected-error{{expected}} \
// expected-warning {{incomplete delimited universal character name}}
int CONCAT(\N{GREEK
, CAPITALLETTERALPHA});
// expected-error@-2 {{expected}} \
// expected-warning@-2 {{incomplete delimited universal character name}}

int \N{\
LATIN CAPITAL LETTER A WITH GRAVE};
//ext-warning@-2 {{extension}} cxx2b-warning@-2 {{before C++2b}}

#ifdef TRIGRAPHS
int \N??<GREEK CAPITAL LETTER ALPHA??> = 0; // expected-warning{{extension}} cxx2b-warning {{before C++2b}} \
int \N??<GREEK CAPITAL LETTER ALPHA??> = 0; // cxx2b-warning {{before C++2b}} \
//ext-warning {{extension}}\
// expected-warning 2{{trigraph converted}}

int a\N{LATIN CAPITAL LETTER A WITH GRAVE??>; // expected-warning {{trigraph converted}}
#endif

#ifndef TRIGRAPHS
int a\N{LATIN CAPITAL LETTER A WITH GRAVE??>;
// expected-warning@-1 {{trigraph ignored}}\
// expected-warning@-1 {{incomplete}}\
// expected-error@-1 {{expected ';' after top level declarator}}
#endif
2 changes: 1 addition & 1 deletion clang/www/cxx_dr_status.html
Expand Up @@ -15647,7 +15647,7 @@ <h2 id="cxxdr">C++ defect report implementation status</h2>
<td><a href="https://wg21.link/cwg2640">2640</a></td>
<td>accepted</td>
<td>Allow more characters in an n-char sequence</td>
<td class="none" align="center">Unknown</td>
<td class="unreleased" align="center">Clang 16</td>
</tr>
<tr id="2641">
<td><a href="https://wg21.link/cwg2641">2641</a></td>
Expand Down

0 comments on commit dbfe446

Please sign in to comment.