Skip to content

Commit

Permalink
[clang][deps] Teach dep directive scanner about _Pragma
Browse files Browse the repository at this point in the history
While we cannot handle `_Pragma` used inside macros, we can handle
this at the top level, and some projects use the `_Pragma("once")`
spelling at the top level, which was causing spurious failures in the scanner.

Limitations
* Cannot handle #define ONCE _Pragma("once"), same issue as using
  @import in a macro -- ideally we should diagnose this in obvious cases
* Our LangOpts are currently fixed, so we are not handling u"" strings
  or R"()" strings that require C11/C++11.

rdar://108629982

Differential Revision: https://reviews.llvm.org/D149884
  • Loading branch information
benlangmuir committed May 9, 2023
1 parent ec77d1f commit ee8ed0b
Show file tree
Hide file tree
Showing 5 changed files with 271 additions and 41 deletions.
7 changes: 7 additions & 0 deletions clang/include/clang/Lex/Pragma.h
Original file line number Diff line number Diff line change
Expand Up @@ -123,6 +123,13 @@ class PragmaNamespace : public PragmaHandler {
PragmaNamespace *getIfNamespace() override { return this; }
};

/// Destringize a \c _Pragma("") string according to C11 6.10.9.1:
/// "The string literal is destringized by deleting any encoding prefix,
/// deleting the leading and trailing double-quotes, replacing each escape
/// sequence \" by a double-quote, and replacing each escape sequence \\ by a
/// single backslash."
void prepare_PragmaString(SmallVectorImpl<char> &StrVal);

} // namespace clang

#endif // LLVM_CLANG_LEX_PRAGMA_H
114 changes: 105 additions & 9 deletions clang/lib/Lex/DependencyDirectivesScanner.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
#include "clang/Basic/Diagnostic.h"
#include "clang/Lex/LexDiagnostic.h"
#include "clang/Lex/Lexer.h"
#include "clang/Lex/Pragma.h"
#include "llvm/ADT/ScopeExit.h"
#include "llvm/ADT/SmallString.h"
#include "llvm/ADT/StringMap.h"
Expand Down Expand Up @@ -72,6 +73,8 @@ struct Scanner {
// Set the lexer to use 'tok::at' for '@', instead of 'tok::unknown'.
LangOpts.ObjC = true;
LangOpts.LineComment = true;
// FIXME: we do not enable C11 or C++11, so we are missing u/u8/U"" and
// R"()" literals.
return LangOpts;
}

Expand All @@ -91,6 +94,10 @@ struct Scanner {
void skipLine(const char *&First, const char *const End);
void skipDirective(StringRef Name, const char *&First, const char *const End);

/// Returns the spelling of a string literal or identifier after performing
/// any processing needed to handle \c clang::Token::NeedsCleaning.
StringRef cleanStringIfNeeded(const dependency_directives_scan::Token &Tok);

/// Lexes next token and if it is identifier returns its string, otherwise
/// it skips the current line and returns \p std::nullopt.
///
Expand All @@ -112,13 +119,30 @@ struct Scanner {
const char *&First,
const char *const End);

/// Lexes next token and returns true iff it matches the kind \p K.
/// Otherwise it skips the current line and returns false.
///
/// In any case (whatever the token kind) \p First and the \p Lexer will
/// advance beyond the token.
[[nodiscard]] bool isNextTokenOrSkipLine(tok::TokenKind K, const char *&First,
const char *const End);

/// Lexes next token and if it is string literal, returns its string.
/// Otherwise, it skips the current line and returns \p std::nullopt.
///
/// In any case (whatever the token kind) \p First and the \p Lexer will
/// advance beyond the token.
[[nodiscard]] std::optional<StringRef>
tryLexStringLiteralOrSkipLine(const char *&First, const char *const End);

[[nodiscard]] bool scanImpl(const char *First, const char *const End);
[[nodiscard]] bool lexPPLine(const char *&First, const char *const End);
[[nodiscard]] bool lexAt(const char *&First, const char *const End);
[[nodiscard]] bool lexModule(const char *&First, const char *const End);
[[nodiscard]] bool lexDefine(const char *HashLoc, const char *&First,
const char *const End);
[[nodiscard]] bool lexPragma(const char *&First, const char *const End);
[[nodiscard]] bool lex_Pragma(const char *&First, const char *const End);
[[nodiscard]] bool lexEndif(const char *&First, const char *const End);
[[nodiscard]] bool lexDefault(DirectiveKind Kind, const char *&First,
const char *const End);
Expand Down Expand Up @@ -525,22 +549,18 @@ void Scanner::lexPPDirectiveBody(const char *&First, const char *const End) {
}
}

[[nodiscard]] std::optional<StringRef>
Scanner::tryLexIdentifierOrSkipLine(const char *&First, const char *const End) {
const dependency_directives_scan::Token &Tok = lexToken(First, End);
if (Tok.isNot(tok::raw_identifier)) {
if (!Tok.is(tok::eod))
skipLine(First, End);
return std::nullopt;
}

StringRef
Scanner::cleanStringIfNeeded(const dependency_directives_scan::Token &Tok) {
bool NeedsCleaning = Tok.Flags & clang::Token::NeedsCleaning;
if (LLVM_LIKELY(!NeedsCleaning))
return Input.slice(Tok.Offset, Tok.getEnd());

SmallString<64> Spelling;
Spelling.resize(Tok.Length);

// FIXME: C++11 raw string literals need special handling (see getSpellingSlow
// in the Lexer). Currently we cannot see them due to our LangOpts.

unsigned SpellingLength = 0;
const char *BufPtr = Input.begin() + Tok.Offset;
const char *AfterIdent = Input.begin() + Tok.getEnd();
Expand All @@ -555,6 +575,18 @@ Scanner::tryLexIdentifierOrSkipLine(const char *&First, const char *const End) {
.first->first();
}

std::optional<StringRef>
Scanner::tryLexIdentifierOrSkipLine(const char *&First, const char *const End) {
  // Consume the next token; only a raw identifier produces a value.
  const dependency_directives_scan::Token &Tok = lexToken(First, End);
  if (Tok.is(tok::raw_identifier))
    return cleanStringIfNeeded(Tok);

  // Anything else: discard the remainder of the line, unless the directive
  // already ended (eod), and report failure.
  if (!Tok.is(tok::eod))
    skipLine(First, End);
  return std::nullopt;
}

StringRef Scanner::lexIdentifier(const char *&First, const char *const End) {
std::optional<StringRef> Id = tryLexIdentifierOrSkipLine(First, End);
assert(Id && "expected identifier token");
Expand All @@ -572,6 +604,28 @@ bool Scanner::isNextIdentifierOrSkipLine(StringRef Id, const char *&First,
return false;
}

bool Scanner::isNextTokenOrSkipLine(tok::TokenKind K, const char *&First,
                                    const char *const End) {
  // Always advance past the next token, whatever its kind.
  const dependency_directives_scan::Token &Tok = lexToken(First, End);
  if (!Tok.is(K)) {
    // Kind mismatch: give up on this line.
    skipLine(First, End);
    return false;
  }
  return true;
}

std::optional<StringRef>
Scanner::tryLexStringLiteralOrSkipLine(const char *&First,
                                       const char *const End) {
  // Consume the next token; only a string literal yields a spelling.
  const dependency_directives_scan::Token &Tok = lexToken(First, End);
  if (tok::isStringLiteral(Tok.Kind))
    return cleanStringIfNeeded(Tok);

  // Not a string literal: skip the rest of the line unless the directive
  // already ended.
  if (!Tok.is(tok::eod))
    skipLine(First, End);
  return std::nullopt;
}

bool Scanner::lexAt(const char *&First, const char *const End) {
// Handle "@import".

Expand Down Expand Up @@ -629,6 +683,41 @@ bool Scanner::lexModule(const char *&First, const char *const End) {
return lexModuleDirectiveBody(Kind, First, End);
}

/// Handle a top-level `_Pragma("...")` operator by destringizing its argument
/// and scanning the result as if it were a `#pragma` directive.
///
/// On a malformed `_Pragma` (missing paren or string literal) the line is
/// skipped and false is returned. NOTE(review): the true return forwards
/// lexPragma's result on the destringized buffer — confirm it signals
/// scan-termination like the other lex* members.
bool Scanner::lex_Pragma(const char *&First, const char *const End) {
  // Expect '(' right after the _Pragma identifier.
  if (!isNextTokenOrSkipLine(tok::l_paren, First, End))
    return false;

  std::optional<StringRef> Str = tryLexStringLiteralOrSkipLine(First, End);

  // Require a string literal argument followed by ')'.
  if (!Str || !isNextTokenOrSkipLine(tok::r_paren, First, End))
    return false;

  // Destringize per C11 6.10.9.1 so the contents can be lexed as a #pragma.
  SmallString<64> Buffer(*Str);
  prepare_PragmaString(Buffer);

  // Use a new scanner instance since the tokens will be inside the allocated
  // string. We should already have captured all the relevant tokens in the
  // current scanner.
  SmallVector<dependency_directives_scan::Token> DiscardTokens;
  const char *Begin = Buffer.c_str();
  Scanner PragmaScanner{StringRef(Begin, Buffer.size()), DiscardTokens, Diags,
                        InputSourceLoc};

  // The destringized text is a directive body, not a full source line.
  PragmaScanner.TheLexer.setParsingPreprocessorDirective(true);
  if (PragmaScanner.lexPragma(Begin, Buffer.end()))
    return true;

  // If the nested scan found no directive of interest, treat the whole
  // _Pragma line as irrelevant.
  DirectiveKind K = PragmaScanner.topDirective();
  if (K == pp_none) {
    skipLine(First, End);
    return false;
  }

  // The nested scanner must have consumed the entire destringized buffer.
  assert(Begin == Buffer.end());
  // Record the directive found inside the _Pragma in this scanner's output.
  pushDirective(K);
  return false;
}

bool Scanner::lexPragma(const char *&First, const char *const End) {
std::optional<StringRef> FoundId = tryLexIdentifierOrSkipLine(First, End);
if (!FoundId)
Expand Down Expand Up @@ -713,6 +802,7 @@ static bool isStartOfRelevantLine(char First) {
case 'i':
case 'e':
case 'm':
case '_':
return true;
}
return false;
Expand Down Expand Up @@ -749,6 +839,12 @@ bool Scanner::lexPPLine(const char *&First, const char *const End) {
if (*First == 'i' || *First == 'e' || *First == 'm')
return lexModule(First, End);

if (*First == '_') {
if (isNextIdentifierOrSkipLine("_Pragma", First, End))
return lex_Pragma(First, End);
return false;
}

// Handle preprocessing directives.

TheLexer.setParsingPreprocessorDirective(true);
Expand Down
69 changes: 40 additions & 29 deletions clang/lib/Lex/Pragma.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -262,17 +262,48 @@ void Preprocessor::Handle_Pragma(Token &Tok) {

SourceLocation RParenLoc = Tok.getLocation();
bool Invalid = false;
std::string StrVal = getSpelling(StrTok, &Invalid);
SmallString<64> StrVal;
StrVal.resize(StrTok.getLength());
StringRef StrValRef = getSpelling(StrTok, StrVal, &Invalid);
if (Invalid) {
Diag(PragmaLoc, diag::err__Pragma_malformed);
return;
}

// The _Pragma is lexically sound. Destringize according to C11 6.10.9.1:
// "The string literal is destringized by deleting any encoding prefix,
// deleting the leading and trailing double-quotes, replacing each escape
// sequence \" by a double-quote, and replacing each escape sequence \\ by a
// single backslash."
assert(StrValRef.size() <= StrVal.size());

// If the token was spelled somewhere else, copy it.
if (StrValRef.begin() != StrVal.begin())
StrVal.assign(StrValRef);
// Truncate if necessary.
else if (StrValRef.size() != StrVal.size())
StrVal.resize(StrValRef.size());

// The _Pragma is lexically sound. Destringize according to C11 6.10.9.1.
prepare_PragmaString(StrVal);

// Plop the string (including the newline and trailing null) into a buffer
// where we can lex it.
Token TmpTok;
TmpTok.startToken();
CreateString(StrVal, TmpTok);
SourceLocation TokLoc = TmpTok.getLocation();

// Make and enter a lexer object so that we lex and expand the tokens just
// like any others.
Lexer *TL = Lexer::Create_PragmaLexer(TokLoc, PragmaLoc, RParenLoc,
StrVal.size(), *this);

EnterSourceFileWithLexer(TL, nullptr);

// With everything set up, lex this as a #pragma directive.
HandlePragmaDirective({PIK__Pragma, PragmaLoc});

// Finally, return whatever came after the pragma directive.
return Lex(Tok);
}

void clang::prepare_PragmaString(SmallVectorImpl<char> &StrVal) {
if (StrVal[0] == 'L' || StrVal[0] == 'U' ||
(StrVal[0] == 'u' && StrVal[1] != '8'))
StrVal.erase(StrVal.begin());
Expand All @@ -296,8 +327,8 @@ void Preprocessor::Handle_Pragma(Token &Tok) {

// Remove 'R " d-char-sequence' and 'd-char-sequence "'. We'll replace the
// parens below.
StrVal.erase(0, 2 + NumDChars);
StrVal.erase(StrVal.size() - 1 - NumDChars);
StrVal.erase(StrVal.begin(), StrVal.begin() + 2 + NumDChars);
StrVal.erase(StrVal.end() - 1 - NumDChars, StrVal.end());
} else {
assert(StrVal[0] == '"' && StrVal[StrVal.size()-1] == '"' &&
"Invalid string token!");
Expand All @@ -319,27 +350,7 @@ void Preprocessor::Handle_Pragma(Token &Tok) {
StrVal[0] = ' ';

// Replace the terminating quote with a \n.
StrVal[StrVal.size()-1] = '\n';

// Plop the string (including the newline and trailing null) into a buffer
// where we can lex it.
Token TmpTok;
TmpTok.startToken();
CreateString(StrVal, TmpTok);
SourceLocation TokLoc = TmpTok.getLocation();

// Make and enter a lexer object so that we lex and expand the tokens just
// like any others.
Lexer *TL = Lexer::Create_PragmaLexer(TokLoc, PragmaLoc, RParenLoc,
StrVal.size(), *this);

EnterSourceFileWithLexer(TL, nullptr);

// With everything set up, lex this as a #pragma directive.
HandlePragmaDirective({PIK__Pragma, PragmaLoc});

// Finally, return whatever came after the pragma directive.
return Lex(Tok);
StrVal[StrVal.size() - 1] = '\n';
}

/// HandleMicrosoft__pragma - Like Handle_Pragma except the pragma text
Expand Down
24 changes: 24 additions & 0 deletions clang/test/ClangScanDeps/_Pragma-once.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
// Test scanning deps works with _Pragma syntax when not inside a macro.
//
// a.h guards itself with _Pragma("once") rather than #pragma once, and a.h
// and b.h include each other; the scan only succeeds if the dependency
// directive scanner understands top-level _Pragma.

// RUN: rm -rf %t
// RUN: split-file %s %t
// RUN: sed "s|DIR|%/t|g" %t/cdb.json.template > %t/cdb.json

// RUN: clang-scan-deps -compilation-database %t/cdb.json -j 1

//--- cdb.json.template
[{
  "directory": "DIR",
  "command": "clang -fsyntax-only DIR/tu.c",
  "file": "DIR/tu.c"
}]

//--- a.h
_Pragma("once")
#include "b.h"

//--- b.h
#include "a.h"

//--- tu.c
#include "a.h"

0 comments on commit ee8ed0b

Please sign in to comment.