Skip to content

Commit

Permalink
Cleanup identifier parsing; NFC
Browse files Browse the repository at this point in the history
Rename methods to clearly signal when they only deal with ASCII,
simplify the parsing of identifiers, and use start/continue instead of
head/body for consistency with Unicode terminology.
  • Loading branch information
cor3ntin authored and AaronBallman committed Sep 14, 2021
1 parent 9bbc0c1 commit 601102d
Show file tree
Hide file tree
Showing 24 changed files with 298 additions and 309 deletions.
2 changes: 1 addition & 1 deletion clang-tools-extra/clang-include-fixer/IncludeFixer.cpp
Expand Up @@ -245,7 +245,7 @@ clang::TypoCorrection IncludeFixerSemaSource::CorrectTypo(
// parent_path.
// FIXME: Don't rely on source text.
const char *End = Source.end();
while (isIdentifierBody(*End) || *End == ':')
while (isAsciiIdentifierContinue(*End) || *End == ':')
++End;

return std::string(Source.begin(), End);
Expand Down
2 changes: 1 addition & 1 deletion clang-tools-extra/clang-tidy/google/IntegerTypesCheck.cpp
Expand Up @@ -129,7 +129,7 @@ void IntegerTypesCheck::check(const MatchFinder::MatchResult &Result) {
const StringRef Port = "unsigned short port";
const char *Data = Result.SourceManager->getCharacterData(Loc);
if (!std::strncmp(Data, Port.data(), Port.size()) &&
!isIdentifierBody(Data[Port.size()]))
!isAsciiIdentifierContinue(Data[Port.size()]))
return;

std::string Replacement =
Expand Down
Expand Up @@ -464,7 +464,7 @@ void RenamerClangTidyCheck::check(const MatchFinder::MatchResult &Result) {
Failure.FixStatus = ShouldFixStatus::ConflictsWithKeyword;
else if (Ident->hasMacroDefinition())
Failure.FixStatus = ShouldFixStatus::ConflictsWithMacroDefinition;
} else if (!isValidIdentifier(Info.Fixup)) {
} else if (!isValidAsciiIdentifier(Info.Fixup)) {
Failure.FixStatus = ShouldFixStatus::FixInvalidIdentifier;
}

Expand Down
10 changes: 5 additions & 5 deletions clang-tools-extra/clangd/CodeComplete.cpp
Expand Up @@ -1842,14 +1842,14 @@ CompletionPrefix guessCompletionPrefix(llvm::StringRef Content,
CompletionPrefix Result;

// Consume the unqualified name. We only handle ASCII characters.
// isIdentifierBody will let us match "0invalid", but we don't mind.
while (!Rest.empty() && isIdentifierBody(Rest.back()))
// isAsciiIdentifierContinue will let us match "0invalid", but we don't mind.
while (!Rest.empty() && isAsciiIdentifierContinue(Rest.back()))
Rest = Rest.drop_back();
Result.Name = Content.slice(Rest.size(), Offset);

// Consume qualifiers.
while (Rest.consume_back("::") && !Rest.endswith(":")) // reject ::::
while (!Rest.empty() && isIdentifierBody(Rest.back()))
while (!Rest.empty() && isAsciiIdentifierContinue(Rest.back()))
Rest = Rest.drop_back();
Result.Qualifier =
Content.slice(Rest.size(), Result.Name.begin() - Content.begin());
Expand Down Expand Up @@ -2057,8 +2057,8 @@ bool allowImplicitCompletion(llvm::StringRef Content, unsigned Offset) {
return true;

// Complete words. Give non-ascii characters the benefit of the doubt.
return !Content.empty() &&
(isIdentifierBody(Content.back()) || !llvm::isASCII(Content.back()));
return !Content.empty() && (isAsciiIdentifierContinue(Content.back()) ||
!llvm::isASCII(Content.back()));
}

} // namespace clangd
Expand Down
4 changes: 2 additions & 2 deletions clang-tools-extra/clangd/SourceCode.cpp
Expand Up @@ -945,9 +945,9 @@ llvm::Optional<SpelledWord> SpelledWord::touching(SourceLocation SpelledLoc,
if (Invalid)
return llvm::None;
unsigned B = Offset, E = Offset;
while (B > 0 && isIdentifierBody(Code[B - 1]))
while (B > 0 && isAsciiIdentifierContinue(Code[B - 1]))
--B;
while (E < Code.size() && isIdentifierBody(Code[E]))
while (E < Code.size() && isAsciiIdentifierContinue(Code[E]))
++E;
if (B == E)
return llvm::None;
Expand Down
4 changes: 2 additions & 2 deletions clang-tools-extra/clangd/refactor/Rename.cpp
Expand Up @@ -478,10 +478,10 @@ static bool mayBeValidIdentifier(llvm::StringRef Ident) {
// We don't check all the rules for non-ascii characters (most are allowed).
bool AllowDollar = true; // lenient
if (llvm::isASCII(Ident.front()) &&
!isIdentifierHead(Ident.front(), AllowDollar))
!isAsciiIdentifierStart(Ident.front(), AllowDollar))
return false;
for (char C : Ident) {
if (llvm::isASCII(C) && !isIdentifierBody(C, AllowDollar))
if (llvm::isASCII(C) && !isAsciiIdentifierContinue(C, AllowDollar))
return false;
}
return true;
Expand Down
16 changes: 8 additions & 8 deletions clang/include/clang/Basic/CharInfo.h
Expand Up @@ -50,8 +50,8 @@ LLVM_READNONE inline bool isASCII(uint32_t c) { return c <= 127; }

/// Returns true if this is a valid first character of a C identifier,
/// which is [a-zA-Z_].
LLVM_READONLY inline bool isIdentifierHead(unsigned char c,
bool AllowDollar = false) {
LLVM_READONLY inline bool isAsciiIdentifierStart(unsigned char c,
bool AllowDollar = false) {
using namespace charinfo;
if (InfoTable[c] & (CHAR_UPPER|CHAR_LOWER|CHAR_UNDER))
return true;
Expand All @@ -60,8 +60,8 @@ LLVM_READONLY inline bool isIdentifierHead(unsigned char c,

/// Returns true if this is a body character of a C identifier,
/// which is [a-zA-Z0-9_].
LLVM_READONLY inline bool isIdentifierBody(unsigned char c,
bool AllowDollar = false) {
LLVM_READONLY inline bool isAsciiIdentifierContinue(unsigned char c,
bool AllowDollar = false) {
using namespace charinfo;
if (InfoTable[c] & (CHAR_UPPER|CHAR_LOWER|CHAR_DIGIT|CHAR_UNDER))
return true;
Expand Down Expand Up @@ -186,13 +186,13 @@ LLVM_READONLY inline char toUppercase(char c) {
///
/// Note that this is a very simple check; it does not accept UCNs as valid
/// identifier characters.
LLVM_READONLY inline bool isValidIdentifier(StringRef S,
bool AllowDollar = false) {
if (S.empty() || !isIdentifierHead(S[0], AllowDollar))
LLVM_READONLY inline bool isValidAsciiIdentifier(StringRef S,
bool AllowDollar = false) {
if (S.empty() || !isAsciiIdentifierStart(S[0], AllowDollar))
return false;

for (StringRef::iterator I = S.begin(), E = S.end(); I != E; ++I)
if (!isIdentifierBody(*I, AllowDollar))
if (!isAsciiIdentifierContinue(*I, AllowDollar))
return false;

return true;
Expand Down
14 changes: 8 additions & 6 deletions clang/include/clang/Lex/Lexer.h
Expand Up @@ -536,7 +536,8 @@ class Lexer : public PreprocessorLexer {
bool SkipTrailingWhitespaceAndNewLine);

/// Returns true if the given character could appear in an identifier.
static bool isIdentifierBodyChar(char c, const LangOptions &LangOpts);
static bool isAsciiIdentifierContinueChar(char c,
const LangOptions &LangOpts);

/// Checks whether new line pointed by Str is preceded by escape
/// sequence.
Expand Down Expand Up @@ -573,10 +574,7 @@ class Lexer : public PreprocessorLexer {

bool CheckUnicodeWhitespace(Token &Result, uint32_t C, const char *CurPtr);

/// Given that a token begins with the Unicode character \p C, figure out
/// what kind of token it is and dispatch to the appropriate lexing helper
/// function.
bool LexUnicode(Token &Result, uint32_t C, const char *CurPtr);
bool LexUnicodeIdentifierStart(Token &Result, uint32_t C, const char *CurPtr);

/// FormTokenWithChars - When we lex a token, we have identified a span
/// starting at BufferPtr, going to TokEnd that forms the token. This method
Expand Down Expand Up @@ -701,7 +699,11 @@ class Lexer : public PreprocessorLexer {
bool IsStringLiteral);

// Helper functions to lex the remainder of a token of the specific type.
bool LexIdentifier (Token &Result, const char *CurPtr);

// This function handles both ASCII and Unicode identifiers after
// the first codepoint of the identifier has been parsed.
bool LexIdentifierContinue(Token &Result, const char *CurPtr);

bool LexNumericConstant (Token &Result, const char *CurPtr);
bool LexStringLiteral (Token &Result, const char *CurPtr,
tok::TokenKind Kind);
Expand Down
2 changes: 1 addition & 1 deletion clang/lib/ARCMigrate/ObjCMT.cpp
Expand Up @@ -1144,7 +1144,7 @@ static bool AttributesMatch(const Decl *Decl1, const Decl *Decl2,

static bool IsValidIdentifier(ASTContext &Ctx,
const char *Name) {
if (!isIdentifierHead(Name[0]))
if (!isAsciiIdentifierStart(Name[0]))
return false;
std::string NameString = Name;
NameString[0] = toLowercase(NameString[0]);
Expand Down
3 changes: 2 additions & 1 deletion clang/lib/ARCMigrate/TransUnbridgedCasts.cpp
Expand Up @@ -253,7 +253,8 @@ class UnbridgedCastRewriter : public RecursiveASTVisitor<UnbridgedCastRewriter>{

SourceManager &SM = Pass.Ctx.getSourceManager();
char PrevChar = *SM.getCharacterData(InsertLoc.getLocWithOffset(-1));
if (Lexer::isIdentifierBodyChar(PrevChar, Pass.Ctx.getLangOpts()))
if (Lexer::isAsciiIdentifierContinueChar(PrevChar,
Pass.Ctx.getLangOpts()))
BridgeCall += ' ';

if (Kind == OBC_BridgeTransfer)
Expand Down
2 changes: 1 addition & 1 deletion clang/lib/AST/MicrosoftMangle.cpp
Expand Up @@ -3884,7 +3884,7 @@ void MicrosoftMangleContextImpl::mangleStringLiteral(const StringLiteral *SL,
// - ?[A-Z]: The range from \xc1 to \xda.
// - ?[0-9]: The set of [,/\:. \n\t'-].
// - ?$XX: A fallback which maps nibbles.
if (isIdentifierBody(Byte, /*AllowDollar=*/true)) {
if (isAsciiIdentifierContinue(Byte, /*AllowDollar=*/true)) {
Mangler.getStream() << Byte;
} else if (isLetter(Byte & 0x7f)) {
Mangler.getStream() << '?' << static_cast<char>(Byte & 0x7f);
Expand Down
2 changes: 1 addition & 1 deletion clang/lib/Basic/Module.cpp
Expand Up @@ -203,7 +203,7 @@ static void printModuleId(raw_ostream &OS, InputIter Begin, InputIter End,
OS << ".";

StringRef Name = getModuleNameFromComponent(*It);
if (!AllowStringLiterals || isValidIdentifier(Name))
if (!AllowStringLiterals || isValidAsciiIdentifier(Name))
OS << Name;
else {
OS << '"';
Expand Down
4 changes: 2 additions & 2 deletions clang/lib/Edit/EditedSource.cpp
Expand Up @@ -314,8 +314,8 @@ bool EditedSource::commit(const Commit &commit) {
static bool canBeJoined(char left, char right, const LangOptions &LangOpts) {
// FIXME: Should use TokenConcatenation to make sure we don't allow stuff like
// making two '<' adjacent.
return !(Lexer::isIdentifierBodyChar(left, LangOpts) &&
Lexer::isIdentifierBodyChar(right, LangOpts));
return !(Lexer::isAsciiIdentifierContinueChar(left, LangOpts) &&
Lexer::isAsciiIdentifierContinueChar(right, LangOpts));
}

/// Returns true if it is ok to eliminate the trailing whitespace between
Expand Down
4 changes: 2 additions & 2 deletions clang/lib/Frontend/LayoutOverrideSource.cpp
Expand Up @@ -16,11 +16,11 @@ using namespace clang;

/// Parse a simple identifier.
static std::string parseName(StringRef S) {
if (S.empty() || !isIdentifierHead(S[0]))
if (S.empty() || !isAsciiIdentifierStart(S[0]))
return "";

unsigned Offset = 1;
while (Offset < S.size() && isIdentifierBody(S[Offset]))
while (Offset < S.size() && isAsciiIdentifierContinue(S[Offset]))
++Offset;

return S.substr(0, Offset).str();
Expand Down
2 changes: 1 addition & 1 deletion clang/lib/Frontend/Rewrite/FrontendActions.cpp
Expand Up @@ -231,7 +231,7 @@ class RewriteIncludesAction::RewriteImportsListener : public ASTReaderListener {
assert(OS && "loaded module file after finishing rewrite action?");

(*OS) << "#pragma clang module build ";
if (isValidIdentifier(MF->ModuleName))
if (isValidAsciiIdentifier(MF->ModuleName))
(*OS) << MF->ModuleName;
else {
(*OS) << '"';
Expand Down
34 changes: 17 additions & 17 deletions clang/lib/Lex/DependencyDirectivesSourceMinimizer.cpp
Expand Up @@ -131,17 +131,17 @@ LLVM_NODISCARD static bool isRawStringLiteral(const char *First,
--Current;
if (*Current != 'R')
return false;
if (First == Current || !isIdentifierBody(*--Current))
if (First == Current || !isAsciiIdentifierContinue(*--Current))
return true;

// Check for a prefix of "u", "U", or "L".
if (*Current == 'u' || *Current == 'U' || *Current == 'L')
return First == Current || !isIdentifierBody(*--Current);
return First == Current || !isAsciiIdentifierContinue(*--Current);

// Check for a prefix of "u8".
if (*Current != '8' || First == Current || *Current-- != 'u')
return false;
return First == Current || !isIdentifierBody(*--Current);
return First == Current || !isAsciiIdentifierContinue(*--Current);
}

static void skipRawString(const char *&First, const char *const End) {
Expand Down Expand Up @@ -319,7 +319,7 @@ static bool isQuoteCppDigitSeparator(const char *const Start,
if (!isPreprocessingNumberBody(Prev))
return false;
// The next character should be a valid identifier body character.
return (Cur + 1) < End && isIdentifierBody(*(Cur + 1));
return (Cur + 1) < End && isAsciiIdentifierContinue(*(Cur + 1));
}

static void skipLine(const char *&First, const char *const End) {
Expand Down Expand Up @@ -484,7 +484,7 @@ void Minimizer::printAdjacentModuleNameParts(const char *&First,
const char *Last = First;
do
++Last;
while (Last != End && (isIdentifierBody(*Last) || *Last == '.'));
while (Last != End && (isAsciiIdentifierContinue(*Last) || *Last == '.'));
append(First, Last);
First = Last;
}
Expand All @@ -507,7 +507,7 @@ bool Minimizer::printAtImportBody(const char *&First, const char *const End) {
}

// Don't handle macro expansions inside @import for now.
if (!isIdentifierBody(*First) && *First != '.')
if (!isAsciiIdentifierContinue(*First) && *First != '.')
return true;

printAdjacentModuleNameParts(First, End);
Expand All @@ -524,9 +524,9 @@ void Minimizer::printDirectiveBody(const char *&First, const char *const End) {

LLVM_NODISCARD static const char *lexRawIdentifier(const char *First,
const char *const End) {
assert(isIdentifierBody(*First) && "invalid identifer");
assert(isAsciiIdentifierContinue(*First) && "invalid identifer");
const char *Last = First + 1;
while (Last != End && isIdentifierBody(*Last))
while (Last != End && isAsciiIdentifierContinue(*Last))
++Last;
return Last;
}
Expand All @@ -540,7 +540,7 @@ getIdentifierContinuation(const char *First, const char *const End) {
skipNewline(First, End);
if (First == End)
return nullptr;
return isIdentifierBody(First[0]) ? First : nullptr;
return isAsciiIdentifierContinue(First[0]) ? First : nullptr;
}

Minimizer::IdInfo Minimizer::lexIdentifier(const char *First,
Expand Down Expand Up @@ -569,7 +569,7 @@ void Minimizer::printAdjacentMacroArgs(const char *&First,
do
++Last;
while (Last != End &&
(isIdentifierBody(*Last) || *Last == '.' || *Last == ','));
(isAsciiIdentifierContinue(*Last) || *Last == '.' || *Last == ','));
append(First, Last);
First = Last;
}
Expand All @@ -588,7 +588,7 @@ bool Minimizer::printMacroArgs(const char *&First, const char *const End) {
}

// This is intentionally fairly liberal.
if (!(isIdentifierBody(*First) || *First == '.' || *First == ','))
if (!(isAsciiIdentifierContinue(*First) || *First == '.' || *First == ','))
return true;

printAdjacentMacroArgs(First, End);
Expand All @@ -602,7 +602,7 @@ bool Minimizer::printMacroArgs(const char *&First, const char *const End) {
bool Minimizer::isNextIdentifier(StringRef Id, const char *&First,
const char *const End) {
skipWhitespace(First, End);
if (First == End || !isIdentifierHead(*First))
if (First == End || !isAsciiIdentifierStart(*First))
return false;

IdInfo FoundId = lexIdentifier(First, End);
Expand Down Expand Up @@ -639,7 +639,7 @@ bool Minimizer::lexModule(const char *&First, const char *const End) {
if (Id.Name == "export") {
Export = true;
skipWhitespace(First, End);
if (!isIdentifierBody(*First)) {
if (!isAsciiIdentifierContinue(*First)) {
skipLine(First, End);
return false;
}
Expand All @@ -663,7 +663,7 @@ bool Minimizer::lexModule(const char *&First, const char *const End) {
case '"':
break;
default:
if (!isIdentifierBody(*First)) {
if (!isAsciiIdentifierContinue(*First)) {
skipLine(First, End);
return false;
}
Expand All @@ -690,7 +690,7 @@ bool Minimizer::lexDefine(const char *&First, const char *const End) {
append("#define ");
skipWhitespace(First, End);

if (!isIdentifierHead(*First))
if (!isAsciiIdentifierStart(*First))
return reportError(First, diag::err_pp_macro_not_identifier);

IdInfo Id = lexIdentifier(First, End);
Expand Down Expand Up @@ -722,7 +722,7 @@ bool Minimizer::lexDefine(const char *&First, const char *const End) {
bool Minimizer::lexPragma(const char *&First, const char *const End) {
// #pragma.
skipWhitespace(First, End);
if (First == End || !isIdentifierHead(*First))
if (First == End || !isAsciiIdentifierStart(*First))
return false;

IdInfo FoundId = lexIdentifier(First, End);
Expand Down Expand Up @@ -827,7 +827,7 @@ bool Minimizer::lexPPLine(const char *&First, const char *const End) {
if (First == End)
return reportError(First, diag::err_pp_expected_eol);

if (!isIdentifierHead(*First)) {
if (!isAsciiIdentifierStart(*First)) {
skipLine(First, End);
return false;
}
Expand Down

0 comments on commit 601102d

Please sign in to comment.