[Clang][C++23][WIP] P2071 Named universal character escapes

! Missing tests, some cleanup still needed. * Add a function in LLVM to map a name to a codepoint. This uses a trie to minimize memory usage while allowing fast access. * Add a utility to regenerate this data. * Support named escape sequences with an extension warning. I have not yet dealt with the C++23 conformance extension warning. Differential Revision: https://reviews.llvm.org/D123064
llvm · Apr 5, 2022 · 8f777e2 · 8f777e2
1 parent 6cf10b7
commit 8f777e2
Show file tree

Hide file tree

Showing 20 changed files with 21,520 additions and 60 deletions.
diff --git a/clang/include/clang/Basic/DiagnosticLexKinds.td b/clang/include/clang/Basic/DiagnosticLexKinds.td
@@ -128,7 +128,7 @@ def warn_utf8_symbol_zero_width : Warning<
   "some environments">, InGroup<DiagGroup<"unicode-zero-width">>;
 
 def ext_delimited_escape_sequence : Extension<
-  "delimited escape sequences are a Clang extension">,
+  "%select{delimited|named}0 escape sequences are a Clang extension">,
   InGroup<DiagGroup<"delimited-escape-sequence-extension">>;
 def err_delimited_escape_empty : Error<
   "delimited escape sequence cannot be empty">;
@@ -138,17 +138,21 @@ def err_delimited_escape_invalid : Error<
   "invalid digit '%0' in escape sequence">;
 def err_hex_escape_no_digits : Error<
   "\\%0 used with no following hex digits">;
+def err_invalid_ucn_name : Error<
+  "'%0' is not a valid Unicode character name">;
+def note_invalid_ucn_name_loose_matching : Note<
+  "characters names in unicode escape sequences are sensitive to case and whitespaces">;
 def warn_ucn_escape_no_digits : Warning<
   "\\%0 used with no following hex digits; "
   "treating as '\\' followed by identifier">, InGroup<Unicode>;
 def err_ucn_escape_incomplete : Error<
   "incomplete universal character name">;
 def warn_delimited_ucn_incomplete : Warning<
   "incomplete delimited universal character name; "
-  "treating as '\\' 'u' '{' identifier">, InGroup<Unicode>;
+  "treating as '\\' '%0' '{' identifier">, InGroup<Unicode>;
 def warn_delimited_ucn_empty : Warning<
   "empty delimited universal character name; "
-  "treating as '\\' 'u' '{' '}'">, InGroup<Unicode>;
+  "treating as '\\' '%0' '{' '}'">, InGroup<Unicode>;
 def warn_ucn_escape_incomplete : Warning<
   "incomplete universal character name; "
   "treating as '\\' followed by identifier">, InGroup<Unicode>;

diff --git a/clang/include/clang/Lex/Lexer.h b/clang/include/clang/Lex/Lexer.h
@@ -749,6 +749,11 @@ class Lexer : public PreprocessorLexer {
   void codeCompleteIncludedFile(const char *PathStart,
                                 const char *CompletionPoint, bool IsAngled);
 
+  llvm::Optional<uint32_t>
+  tryReadNumericUCN(const char *&StartPtr, const char *SlashLoc, Token *Result);
+  llvm::Optional<uint32_t> tryReadNamedUCN(const char *&StartPtr,
+                                           const char *SlashLoc, Token *Result);
+
   /// Read a universal character name.
   ///
   /// \param StartPtr The position in the source buffer after the initial '\'.

diff --git a/clang/lib/Lex/Lexer.cpp b/clang/lib/Lex/Lexer.cpp
@@ -37,6 +37,7 @@
 #include "llvm/Support/MathExtras.h"
 #include "llvm/Support/MemoryBufferRef.h"
 #include "llvm/Support/NativeFormatting.h"
+#include "llvm/Support/Unicode.h"
 #include "llvm/Support/UnicodeCharRanges.h"
 #include <algorithm>
 #include <cassert>
@@ -3114,27 +3115,28 @@ bool Lexer::isCodeCompletionPoint(const char *CurPtr) const {
   return false;
 }
 
-uint32_t Lexer::tryReadUCN(const char *&StartPtr, const char *SlashLoc,
-                           Token *Result) {
+llvm::Optional<uint32_t> Lexer::tryReadNumericUCN(const char *&StartPtr,
+                                                  const char *SlashLoc,
+                                                  Token *Result) {
   unsigned CharSize;
   char Kind = getCharAndSize(StartPtr, CharSize);
-  bool Delimited = false;
-  bool FoundEndDelimiter = false;
-  unsigned Count = 0;
-  bool Diagnose = Result && !isLexingRawMode();
+  assert((Kind == 'u' || Kind == 'U') && "expected a UCN");
 
   unsigned NumHexDigits;
   if (Kind == 'u')
     NumHexDigits = 4;
   else if (Kind == 'U')
     NumHexDigits = 8;
-  else
-    return 0;
+
+  bool Delimited = false;
+  bool FoundEndDelimiter = false;
+  unsigned Count = 0;
+  bool Diagnose = Result && !isLexingRawMode();
 
   if (!LangOpts.CPlusPlus && !LangOpts.C99) {
     if (Diagnose)
       Diag(SlashLoc, diag::warn_ucn_not_valid_in_c89);
-    return 0;
+    return {};
   }
 
   const char *CurPtr = StartPtr + CharSize;
@@ -3161,14 +3163,14 @@ uint32_t Lexer::tryReadUCN(const char *&StartPtr, const char *SlashLoc,
         break;
       if (Diagnose)
         Diag(BufferPtr, diag::warn_delimited_ucn_incomplete)
-            << StringRef(&C, 1);
-      return 0;
+            << StringRef(KindLoc, 1);
+      return {};
     }
 
     if (CodePoint & 0xF000'0000) {
       if (Diagnose)
         Diag(KindLoc, diag::err_escape_too_large) << 0;
-      return 0;
+      return {};
     }
 
     CodePoint <<= 4;
@@ -3182,7 +3184,13 @@ uint32_t Lexer::tryReadUCN(const char *&StartPtr, const char *SlashLoc,
       Diag(StartPtr, FoundEndDelimiter ? diag::warn_delimited_ucn_empty
                                        : diag::warn_ucn_escape_no_digits)
           << StringRef(KindLoc, 1);
-    return 0;
+    return {};
+  }
+
+  if (Delimited && Kind == 'U') {
+    if (Diagnose)
+      Diag(StartPtr, diag::err_hex_escape_no_digits) << StringRef(KindLoc, 1);
+    return {};
   }
 
   if (!Delimited && Count != NumHexDigits) {
@@ -3195,15 +3203,14 @@ uint32_t Lexer::tryReadUCN(const char *&StartPtr, const char *SlashLoc,
             << FixItHint::CreateReplacement(URange, "u");
       }
     }
-    return 0;
+    return {};
   }
 
   if (Delimited && PP) {
-    Diag(BufferPtr, diag::ext_delimited_escape_sequence);
+    Diag(BufferPtr, diag::ext_delimited_escape_sequence) << /*delimited*/ 0;
   }
 
   if (Result) {
-    Result->setFlag(Token::HasUCN);
     if (CurPtr - StartPtr == (ptrdiff_t)(Count + 2 + (Delimited ? 2 : 0)))
       StartPtr = CurPtr;
     else
@@ -3212,6 +3219,104 @@ uint32_t Lexer::tryReadUCN(const char *&StartPtr, const char *SlashLoc,
   } else {
     StartPtr = CurPtr;
   }
+  return CodePoint;
+}
+
+llvm::Optional<uint32_t> Lexer::tryReadNamedUCN(const char *&StartPtr, // Reads a named escape N{NAME} starting at StartPtr; yields its code point, or an empty Optional on failure (StartPtr is then left untouched).
+                                                const char *, Token *Result) { // The second parameter (SlashLoc in the declaration) is unnamed and unused here.
+  unsigned CharSize;
+  bool Diagnose = Result && !isLexingRawMode(); // Diagnostics are emitted only when a token is being formed outside raw mode.
+
+  char C = getCharAndSize(StartPtr, CharSize);
+  assert(C == 'N' && "expected \\N{...}"); // Callers must only dispatch here when StartPtr is at the 'N'.
+
+  const char *CurPtr = StartPtr + CharSize;
+  const char *KindLoc = &CurPtr[-1]; // Byte just before CurPtr; its one-character spelling is passed to the warnings below.
+
+  C = getCharAndSize(CurPtr, CharSize);
+  if (C != '{') { // No opening brace after 'N': not a named escape.
+    if (Diagnose)
+      Diag(StartPtr, diag::warn_ucn_escape_incomplete);
+    return {};
+  }
+  CurPtr += CharSize; // Step past the '{'.
+
+  bool FoundEndDelimiter = false;
+  bool Invalid = false; // Set once a character outside [A-Z0-9 -] has been collected.
+  llvm::SmallVector<char, 30> Buffer; // Characters collected between the braces.
+  while (C) { // Entered with C == '{'; runs until '}' or a character rejected below.
+    C = getCharAndSize(CurPtr, CharSize);
+    CurPtr += CharSize;
+    if (C == '}') {
+      FoundEndDelimiter = true;
+      break;
+    }
+
+    if (!isAlphanumeric(C) && C != '_' && C != '-' && C != ' ') // Anything else ends the scan without a closing brace.
+      break;
+
+    if ((C < 'A' || C > 'Z') && !llvm::isDigit(C) && C != ' ' && C != '-') { // Still collected (so the error can quote it), but the name is flagged invalid.
+      Invalid = true;
+    }
+    Buffer.push_back(C);
+  }
+
+  if (!FoundEndDelimiter || Buffer.empty()) { // Unterminated or empty name: warn and give up.
+    if (Diagnose)
+      Diag(StartPtr, FoundEndDelimiter ? diag::warn_delimited_ucn_empty
+                                       : diag::warn_delimited_ucn_incomplete)
+          << StringRef(KindLoc, 1);
+    return {};
+  }
+  llvm::Optional<char32_t> Res;
+
+  if (!Invalid) // The lookup is skipped for names already flagged invalid.
+    Res = llvm::sys::unicode::nameToCodepointStrict(
+        StringRef(Buffer.data(), Buffer.size()));
+
+  if (!Res) { // Either flagged invalid above or the lookup produced no value.
+    if (Diagnose)
+      Diag(StartPtr, diag::err_invalid_ucn_name)
+          << StringRef(Buffer.data(), Buffer.size());
+    return {};
+  }
+
+  if (Diagnose && PP) {
+    Diag(BufferPtr, diag::ext_delimited_escape_sequence) << /*named*/ 1; // Index 1 picks the "named" wording of the shared extension diagnostic.
+  }
+
+  if (Result) {
+    if (CurPtr - StartPtr == (ptrdiff_t)(Buffer.size() + 4)) // NOTE(review): with single-byte spellings 'N' '{' name '}' is Buffer.size() + 3 from StartPtr, so + 4 looks off by one (the fallback below still advances) - confirm.
+      StartPtr = CurPtr;
+    else
+      while (StartPtr != CurPtr) // Otherwise advance one character at a time, handing Result to getAndAdvanceChar.
+        (void)getAndAdvanceChar(StartPtr, *Result);
+  } else {
+    StartPtr = CurPtr; // No token to update: just move past the escape.
+  }
+  return *Res;
+}
+
+uint32_t Lexer::tryReadUCN(const char *&StartPtr, const char *SlashLoc,
+                           Token *Result) {
+
+  unsigned CharSize;
+  llvm::Optional<uint32_t> CodePointOpt;
+  char Kind = getCharAndSize(StartPtr, CharSize);
+  if (Kind == 'u' || Kind == 'U')
+    CodePointOpt = tryReadNumericUCN(StartPtr, SlashLoc, Result);
+
+  else if (Kind == 'N')
+    CodePointOpt = tryReadNamedUCN(StartPtr, SlashLoc, Result);
+
+  if (!CodePointOpt)
+    return 0;
+
+  uint32_t CodePoint = *CodePointOpt;
+
+  if (Result) {
+    Result->setFlag(Token::HasUCN);
+  }
 
   // Don't apply C family restrictions to UCNs in assembly mode
   if (LangOpts.AsmPreprocessor)