Skip to content

Commit

Permalink
[Clang][C++23][WIP] P2071 Named universal character escapes
Browse files Browse the repository at this point in the history
! Missing tests, some cleanup still needed.

* Add a function in LLVM to map a name to a codepoint.
This uses a trie to minimize memory usage,
while allowing fast access.

* Add a utility to regenerate this data.

* Support named escape sequences with an extension warning.
I have not yet dealt with the C++23 conformance extension warning.

Differential Revision: https://reviews.llvm.org/D123064
  • Loading branch information
cor3ntin committed Apr 5, 2022
1 parent 6cf10b7 commit 8f777e2
Show file tree
Hide file tree
Showing 20 changed files with 21,520 additions and 60 deletions.
10 changes: 7 additions & 3 deletions clang/include/clang/Basic/DiagnosticLexKinds.td
Expand Up @@ -128,7 +128,7 @@ def warn_utf8_symbol_zero_width : Warning<
"some environments">, InGroup<DiagGroup<"unicode-zero-width">>;

def ext_delimited_escape_sequence : Extension<
"delimited escape sequences are a Clang extension">,
"%select{delimited|named}0 escape sequences are a Clang extension">,
InGroup<DiagGroup<"delimited-escape-sequence-extension">>;
def err_delimited_escape_empty : Error<
"delimited escape sequence cannot be empty">;
Expand All @@ -138,17 +138,21 @@ def err_delimited_escape_invalid : Error<
"invalid digit '%0' in escape sequence">;
def err_hex_escape_no_digits : Error<
"\\%0 used with no following hex digits">;
def err_invalid_ucn_name : Error<
"'%0' is not a valid Unicode character name">;
def note_invalid_ucn_name_loose_matching : Note<
"characters names in unicode escape sequences are sensitive to case and whitespaces">;
def warn_ucn_escape_no_digits : Warning<
"\\%0 used with no following hex digits; "
"treating as '\\' followed by identifier">, InGroup<Unicode>;
def err_ucn_escape_incomplete : Error<
"incomplete universal character name">;
def warn_delimited_ucn_incomplete : Warning<
"incomplete delimited universal character name; "
"treating as '\\' 'u' '{' identifier">, InGroup<Unicode>;
"treating as '\\' '%0' '{' identifier">, InGroup<Unicode>;
def warn_delimited_ucn_empty : Warning<
"empty delimited universal character name; "
"treating as '\\' 'u' '{' '}'">, InGroup<Unicode>;
"treating as '\\' '%0' '{' '}'">, InGroup<Unicode>;
def warn_ucn_escape_incomplete : Warning<
"incomplete universal character name; "
"treating as '\\' followed by identifier">, InGroup<Unicode>;
Expand Down
5 changes: 5 additions & 0 deletions clang/include/clang/Lex/Lexer.h
Expand Up @@ -749,6 +749,11 @@ class Lexer : public PreprocessorLexer {
void codeCompleteIncludedFile(const char *PathStart,
const char *CompletionPoint, bool IsAngled);

/// Try to read a numeric universal character name, i.e. one introduced by
/// 'u' or 'U' (the definition asserts that the character at \p StartPtr is
/// one of those two).
///
/// \param StartPtr Points at the 'u'/'U'. It is advanced past the escape on
///        success.
/// \param SlashLoc Location used when diagnosing a UCN that is not valid in
///        the current language mode.
/// \param Result If non-null, the token being lexed. Diagnostics are only
///        considered when this is non-null and the lexer is not in raw mode.
///
/// \return The code point read, or an empty Optional if no UCN could be read.
llvm::Optional<uint32_t>
tryReadNumericUCN(const char *&StartPtr, const char *SlashLoc, Token *Result);
/// Try to read a named universal character escape of the form N{NAME} (the
/// definition asserts that the character at \p StartPtr is 'N').
///
/// \param StartPtr Points at the 'N'. It is advanced past the closing '}' on
///        success and left unchanged on failure.
/// \param SlashLoc Not used by the current definition.
/// \param Result If non-null, the token being lexed. Diagnostics are only
///        emitted when this is non-null and the lexer is not in raw mode.
///
/// \return The code point the name resolves to, or an empty Optional if the
///         escape is incomplete, empty, or does not name a character.
llvm::Optional<uint32_t> tryReadNamedUCN(const char *&StartPtr,
const char *SlashLoc, Token *Result);

/// Read a universal character name.
///
/// \param StartPtr The position in the source buffer after the initial '\'.
Expand Down
137 changes: 121 additions & 16 deletions clang/lib/Lex/Lexer.cpp
Expand Up @@ -37,6 +37,7 @@
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/MemoryBufferRef.h"
#include "llvm/Support/NativeFormatting.h"
#include "llvm/Support/Unicode.h"
#include "llvm/Support/UnicodeCharRanges.h"
#include <algorithm>
#include <cassert>
Expand Down Expand Up @@ -3114,27 +3115,28 @@ bool Lexer::isCodeCompletionPoint(const char *CurPtr) const {
return false;
}

uint32_t Lexer::tryReadUCN(const char *&StartPtr, const char *SlashLoc,
Token *Result) {
llvm::Optional<uint32_t> Lexer::tryReadNumericUCN(const char *&StartPtr,
const char *SlashLoc,
Token *Result) {
unsigned CharSize;
char Kind = getCharAndSize(StartPtr, CharSize);
bool Delimited = false;
bool FoundEndDelimiter = false;
unsigned Count = 0;
bool Diagnose = Result && !isLexingRawMode();
assert((Kind == 'u' || Kind == 'U') && "expected a UCN");

unsigned NumHexDigits;
if (Kind == 'u')
NumHexDigits = 4;
else if (Kind == 'U')
NumHexDigits = 8;
else
return 0;

bool Delimited = false;
bool FoundEndDelimiter = false;
unsigned Count = 0;
bool Diagnose = Result && !isLexingRawMode();

if (!LangOpts.CPlusPlus && !LangOpts.C99) {
if (Diagnose)
Diag(SlashLoc, diag::warn_ucn_not_valid_in_c89);
return 0;
return {};
}

const char *CurPtr = StartPtr + CharSize;
Expand All @@ -3161,14 +3163,14 @@ uint32_t Lexer::tryReadUCN(const char *&StartPtr, const char *SlashLoc,
break;
if (Diagnose)
Diag(BufferPtr, diag::warn_delimited_ucn_incomplete)
<< StringRef(&C, 1);
return 0;
<< StringRef(KindLoc, 1);
return {};
}

if (CodePoint & 0xF000'0000) {
if (Diagnose)
Diag(KindLoc, diag::err_escape_too_large) << 0;
return 0;
return {};
}

CodePoint <<= 4;
Expand All @@ -3182,7 +3184,13 @@ uint32_t Lexer::tryReadUCN(const char *&StartPtr, const char *SlashLoc,
Diag(StartPtr, FoundEndDelimiter ? diag::warn_delimited_ucn_empty
: diag::warn_ucn_escape_no_digits)
<< StringRef(KindLoc, 1);
return 0;
return {};
}

if (Delimited && Kind == 'U') {
if (Diagnose)
Diag(StartPtr, diag::err_hex_escape_no_digits) << StringRef(KindLoc, 1);
return {};
}

if (!Delimited && Count != NumHexDigits) {
Expand All @@ -3195,15 +3203,14 @@ uint32_t Lexer::tryReadUCN(const char *&StartPtr, const char *SlashLoc,
<< FixItHint::CreateReplacement(URange, "u");
}
}
return 0;
return {};
}

if (Delimited && PP) {
Diag(BufferPtr, diag::ext_delimited_escape_sequence);
Diag(BufferPtr, diag::ext_delimited_escape_sequence) << /*delimited*/ 0;
}

if (Result) {
Result->setFlag(Token::HasUCN);
if (CurPtr - StartPtr == (ptrdiff_t)(Count + 2 + (Delimited ? 2 : 0)))
StartPtr = CurPtr;
else
Expand All @@ -3212,6 +3219,104 @@ uint32_t Lexer::tryReadUCN(const char *&StartPtr, const char *SlashLoc,
} else {
StartPtr = CurPtr;
}
return CodePoint;
}

/// Try to read a named universal character escape of the form \N{NAME}.
///
/// \param StartPtr Points at the 'N' that follows the backslash. On success it
///        is advanced past the closing '}'; on every failure path it is left
///        unchanged.
/// \param Result If non-null, the token being lexed. When null (or when the
///        lexer is in raw mode) no diagnostics are emitted.
///
/// The second parameter (the location of the backslash) is unused here.
///
/// \return The code point NAME resolves to, or an empty Optional when the
///         escape has no '{', is unterminated, is empty, or NAME is not
///         accepted by nameToCodepointStrict.
llvm::Optional<uint32_t> Lexer::tryReadNamedUCN(const char *&StartPtr,
                                                const char * /*SlashLoc*/,
                                                Token *Result) {
  unsigned CharSize;
  bool Diagnose = Result && !isLexingRawMode();

  char C = getCharAndSize(StartPtr, CharSize);
  assert(C == 'N' && "expected \\N{...}");

  const char *CurPtr = StartPtr + CharSize;
  // Last byte of the 'N'; used to print the escape kind in diagnostics.
  const char *KindLoc = &CurPtr[-1];

  C = getCharAndSize(CurPtr, CharSize);
  if (C != '{') {
    if (Diagnose)
      Diag(StartPtr, diag::warn_ucn_escape_incomplete);
    return {};
  }
  CurPtr += CharSize;

  bool FoundEndDelimiter = false;
  bool Invalid = false;
  llvm::SmallVector<char, 30> Buffer;
  while (C) {
    C = getCharAndSize(CurPtr, CharSize);
    CurPtr += CharSize;
    if (C == '}') {
      FoundEndDelimiter = true;
      break;
    }

    // Anything that cannot plausibly be part of a name ends the scan without
    // a closing delimiter.
    if (!isAlphanumeric(C) && C != '_' && C != '-' && C != ' ')
      break;

    // Lowercase letters and '_' are collected so they appear in the
    // diagnostic, but they make the name invalid: only 'A'-'Z', digits,
    // ' ' and '-' are accepted.
    if ((C < 'A' || C > 'Z') && !llvm::isDigit(C) && C != ' ' && C != '-') {
      Invalid = true;
    }
    Buffer.push_back(C);
  }

  if (!FoundEndDelimiter || Buffer.empty()) {
    if (Diagnose)
      Diag(StartPtr, FoundEndDelimiter ? diag::warn_delimited_ucn_empty
                                       : diag::warn_delimited_ucn_incomplete)
          << StringRef(KindLoc, 1);
    return {};
  }
  llvm::Optional<char32_t> Res;

  // Skip the lookup entirely when a disallowed character was seen.
  if (!Invalid)
    Res = llvm::sys::unicode::nameToCodepointStrict(
        StringRef(Buffer.data(), Buffer.size()));

  if (!Res) {
    if (Diagnose)
      Diag(StartPtr, diag::err_invalid_ucn_name)
          << StringRef(Buffer.data(), Buffer.size());
    return {};
  }

  if (Diagnose && PP) {
    Diag(BufferPtr, diag::ext_delimited_escape_sequence) << /*named*/ 1;
  }

  if (Result) {
    // StartPtr points at 'N', so an escape in which every character occupies
    // exactly one byte spans 'N' + '{' + name + '}' == Buffer.size() + 3
    // bytes. In that case the pointer can simply be moved forward. Otherwise
    // at least one character took more than one byte, and the escape is
    // re-read through getAndAdvanceChar against Result.
    // NOTE(review): presumably this is so trigraphs/line splices get recorded
    // on the token - confirm against getAndAdvanceChar.
    if (CurPtr - StartPtr == (ptrdiff_t)(Buffer.size() + 3))
      StartPtr = CurPtr;
    else
      while (StartPtr != CurPtr)
        (void)getAndAdvanceChar(StartPtr, *Result);
  } else {
    StartPtr = CurPtr;
  }
  return *Res;
}

uint32_t Lexer::tryReadUCN(const char *&StartPtr, const char *SlashLoc,
Token *Result) {

unsigned CharSize;
llvm::Optional<uint32_t> CodePointOpt;
char Kind = getCharAndSize(StartPtr, CharSize);
if (Kind == 'u' || Kind == 'U')
CodePointOpt = tryReadNumericUCN(StartPtr, SlashLoc, Result);

else if (Kind == 'N')
CodePointOpt = tryReadNamedUCN(StartPtr, SlashLoc, Result);

if (!CodePointOpt)
return 0;

uint32_t CodePoint = *CodePointOpt;

if (Result) {
Result->setFlag(Token::HasUCN);
}

// Don't apply C family restrictions to UCNs in assembly mode
if (LangOpts.AsmPreprocessor)
Expand Down

0 comments on commit 8f777e2

Please sign in to comment.