-
Notifications
You must be signed in to change notification settings - Fork 11.6k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
[Syntax] Introduce TokenBuffer, start clangToolingSyntax library
Summary: TokenBuffer stores the list of tokens for a file obtained after preprocessing. This is a base building block for syntax trees, see [1] for the full proposal on syntax trees. This commit also starts a new sub-library of ClangTooling, which would be the home for the syntax trees and syntax-tree-based refactoring utilities. [1]: https://lists.llvm.org/pipermail/cfe-dev/2019-February/061414.html Reviewers: gribozavr, sammccall Reviewed By: sammccall Subscribers: mgrang, riccibruno, Eugene.Zelenko, mgorny, jdoerfert, cfe-commits Tags: #clang Differential Revision: https://reviews.llvm.org/D59887 llvm-svn: 361148
- Loading branch information
1 parent
c868ee1
commit d810bb7
Showing
7 changed files
with
1,499 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,302 @@ | ||
//===- Tokens.h - collect tokens from preprocessing --------------*- C++-*-===// | ||
// | ||
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. | ||
// See https://llvm.org/LICENSE.txt for license information. | ||
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception | ||
// | ||
//===----------------------------------------------------------------------===// | ||
// Record tokens that a preprocessor emits and define operations to map between | ||
// the tokens written in a file and tokens produced by the preprocessor. | ||
// | ||
// When running the compiler, there are two token streams we are interested in: | ||
// - "spelled" tokens directly correspond to a substring written in some | ||
// source file. | ||
// - "expanded" tokens represent the result of preprocessing; the parser | ||
// consumes this token stream to produce the AST. | ||
// | ||
// Expanded tokens correspond directly to locations found in the AST, allowing | ||
// one to find subranges of the token stream covered by various AST nodes. | ||
// Spelled tokens correspond directly to the source code written by the user. | ||
// | ||
// To allow composing these two use-cases, we also define operations that map | ||
// between expanded and spelled tokens that produced them (macro calls, | ||
// directives, etc). | ||
// | ||
//===----------------------------------------------------------------------===// | ||
|
||
#ifndef LLVM_CLANG_TOOLING_SYNTAX_TOKENS_H | ||
#define LLVM_CLANG_TOOLING_SYNTAX_TOKENS_H | ||
|
||
#include "clang/Basic/FileManager.h" | ||
#include "clang/Basic/LangOptions.h" | ||
#include "clang/Basic/SourceLocation.h" | ||
#include "clang/Basic/SourceManager.h" | ||
#include "clang/Basic/TokenKinds.h" | ||
#include "clang/Lex/Token.h" | ||
#include "llvm/ADT/ArrayRef.h" | ||
#include "llvm/ADT/DenseMap.h" | ||
#include "llvm/ADT/Optional.h" | ||
#include "llvm/ADT/StringRef.h" | ||
#include "llvm/Support/Compiler.h" | ||
#include "llvm/Support/raw_ostream.h" | ||
#include <cstdint> | ||
#include <string> | ||
#include <tuple> | ||
#include <utility> | ||
#include <vector> | ||
|
||
namespace clang { | ||
class Preprocessor; | ||
|
||
namespace syntax { | ||
|
||
/// A half-open character range [Begin, End) inside a particular file: the | ||
/// start offset is included and the end offset is excluded from the range. | ||
/// Offsets are 'unsigned' values in the file returned by file(). | ||
struct FileRange { | ||
/// Constructs a range in \p File from explicit begin and end offsets. | ||
/// EXPECTS: File.isValid() && BeginOffset <= EndOffset. | ||
FileRange(FileID File, unsigned BeginOffset, unsigned EndOffset); | ||
/// Constructs a range that starts at \p BeginLoc. \p Length is presumably the | ||
/// number of characters covered (the definition is not in this header). | ||
/// EXPECTS: BeginLoc.isValid() && BeginLoc.isFileID(). | ||
FileRange(const SourceManager &SM, SourceLocation BeginLoc, unsigned Length); | ||
/// Constructs a range from a pair of source locations. | ||
/// EXPECTS: BeginLoc.isValid() && BeginLoc.isFileID(), Begin <= End and files | ||
/// are the same. | ||
/// NOTE(review): presumably \p EndLoc must also be a valid file location; | ||
/// confirm against the definition. | ||
FileRange(const SourceManager &SM, SourceLocation BeginLoc, | ||
SourceLocation EndLoc); | ||
|
||
/// The file this range belongs to. | ||
FileID file() const { return File; } | ||
/// Start offset (inclusive) in the corresponding file. | ||
unsigned beginOffset() const { return Begin; } | ||
/// End offset (exclusive) in the corresponding file. | ||
unsigned endOffset() const { return End; } | ||
|
||
/// Number of characters in the range, computed as End - Begin. This is an | ||
/// unsigned subtraction, so it relies on Begin <= End (see the constructor | ||
/// preconditions). | ||
unsigned length() const { return End - Begin; } | ||
|
||
/// Gets the substring that this FileRange refers to. | ||
llvm::StringRef text(const SourceManager &SM) const; | ||
|
||
/// Two ranges are equal iff their file, begin offset and end offset all match. | ||
friend bool operator==(const FileRange &L, const FileRange &R) { | ||
return std::tie(L.File, L.Begin, L.End) == std::tie(R.File, R.Begin, R.End); | ||
} | ||
/// Negation of operator==. | ||
friend bool operator!=(const FileRange &L, const FileRange &R) { | ||
return !(L == R); | ||
} | ||
|
||
private: | ||
// File containing the range, and the [Begin, End) offsets inside it. | ||
FileID File; | ||
unsigned Begin; | ||
unsigned End; | ||
}; | ||
|
||
/// Stream insertion operator for FileRange. For debugging purposes only; the | ||
/// output format is defined by the implementation, which is not in this header. | ||
llvm::raw_ostream &operator<<(llvm::raw_ostream &OS, const FileRange &R); | ||
|
||
/// A token coming directly from a file or from a macro invocation. Has just | ||
/// enough information to locate the token in the source code: a start | ||
/// location, a length and a token kind. | ||
/// Can represent both expanded and spelled tokens. | ||
class Token { | ||
public: | ||
/// Constructs a token from its start location, length and kind; the three | ||
/// values are stored as given. | ||
Token(SourceLocation Location, unsigned Length, tok::TokenKind Kind) | ||
: Location(Location), Length(Length), Kind(Kind) {} | ||
/// Constructs a token from a lexer/preprocessor token. | ||
/// EXPECTS: clang::Token is not an annotation token. | ||
explicit Token(const clang::Token &T); | ||
|
||
/// Kind of the token, e.g. tok::kw_int. | ||
tok::TokenKind kind() const { return Kind; } | ||
/// Location of the first character of a token. | ||
SourceLocation location() const { return Location; } | ||
/// Location right after the last character of a token, computed as the start | ||
/// location offset by the token length. | ||
SourceLocation endLocation() const { | ||
return Location.getLocWithOffset(Length); | ||
} | ||
/// Length of the token, as passed to the constructor. | ||
unsigned length() const { return Length; } | ||
|
||
/// Get the substring covered by the token. Note that it will include all | ||
/// digraphs, newline continuations, etc. E.g. tokens for 'int' and | ||
/// in\ | ||
/// t | ||
/// both have the same kind tok::kw_int, but results of text() are different. | ||
llvm::StringRef text(const SourceManager &SM) const; | ||
|
||
/// Gets a range of this token. | ||
/// EXPECTS: token comes from a file, not from a macro expansion. | ||
FileRange range(const SourceManager &SM) const; | ||
|
||
/// Given two tokens inside the same file, returns a file range that starts at | ||
/// \p First and ends at \p Last. | ||
/// EXPECTS: First and Last are file tokens from the same file, Last starts | ||
/// after First. | ||
static FileRange range(const SourceManager &SM, const syntax::Token &First, | ||
const syntax::Token &Last); | ||
|
||
/// String form of the token intended for tests; presumably uses \p SM to | ||
/// obtain the token text (the definition is not in this header). | ||
std::string dumpForTests(const SourceManager &SM) const; | ||
/// For debugging purposes. | ||
std::string str() const; | ||
|
||
private: | ||
// Start of the token, its length, and its kind. | ||
SourceLocation Location; | ||
unsigned Length; | ||
tok::TokenKind Kind; | ||
}; | ||
/// Stream insertion operator for syntax::Token. For debugging purposes. | ||
/// Equivalent to a call to Token::str(). | ||
llvm::raw_ostream &operator<<(llvm::raw_ostream &OS, const Token &T); | ||
|
||
/// A list of tokens obtained by preprocessing a text buffer and operations to | ||
/// map between the expanded and spelled tokens, i.e. TokenBuffer has | ||
/// information about two token streams: | ||
/// 1. Expanded tokens: tokens produced by the preprocessor after all macro | ||
/// replacements, | ||
/// 2. Spelled tokens: corresponding directly to the source code of a file | ||
/// before any macro replacements occurred. | ||
/// Here's an example to illustrate a difference between those two: | ||
/// #define FOO 10 | ||
/// int a = FOO; | ||
/// | ||
/// Spelled tokens are {'#','define','FOO','10','int','a','=','FOO',';'}. | ||
/// Expanded tokens are {'int','a','=','10',';','eof'}. | ||
/// | ||
/// Note that the expanded token stream has a tok::eof token at the end, the | ||
/// spelled tokens never store a 'eof' token. | ||
/// | ||
/// The full list of expanded tokens can be obtained with expandedTokens(). | ||
/// Spelled tokens for each of the files can be obtained via | ||
/// spelledTokens(FileID). | ||
/// | ||
/// To map from expanded to spelled tokens use spelledForExpanded(). | ||
/// | ||
/// To build a token buffer use the TokenCollector class. You can also compute | ||
/// the spelled tokens of a file using the tokenize() helper. | ||
/// | ||
/// FIXME: allow to map from spelled to expanded tokens when use-case shows up. | ||
class TokenBuffer { | ||
public: | ||
/// Creates an empty buffer. Only the address of \p SourceMgr is stored, so | ||
/// the SourceManager has to stay alive for as long as the stored pointer is | ||
/// used. | ||
/// NOTE(review): this single-argument constructor is not 'explicit', so a | ||
/// SourceManager converts implicitly to a TokenBuffer; confirm that is | ||
/// intended. | ||
TokenBuffer(const SourceManager &SourceMgr) : SourceMgr(&SourceMgr) {} | ||
/// All tokens produced by the preprocessor after all macro replacements, | ||
/// directives, etc. Source locations found in the clang AST will always | ||
/// point to one of these tokens. | ||
/// FIXME: figure out how to handle token splitting, e.g. '>>' can be split | ||
/// into two '>' tokens by the parser. However, TokenBuffer currently | ||
/// keeps it as a single '>>' token. | ||
llvm::ArrayRef<syntax::Token> expandedTokens() const { | ||
return ExpandedTokens; | ||
} | ||
|
||
/// Find the subrange of spelled tokens that produced the corresponding \p | ||
/// Expanded tokens. | ||
/// | ||
/// EXPECTS: \p Expanded is a subrange of expandedTokens(). | ||
/// | ||
/// Will fail if the expanded tokens do not correspond to a | ||
/// sequence of spelled tokens. E.g. for the following example: | ||
/// | ||
/// #define FIRST f1 f2 f3 | ||
/// #define SECOND s1 s2 s3 | ||
/// | ||
/// a FIRST b SECOND c // expanded tokens are: a f1 f2 f3 b s1 s2 s3 c | ||
/// | ||
/// the results would be: | ||
/// expanded => spelled | ||
/// ------------------------ | ||
/// a => a | ||
/// s1 s2 s3 => SECOND | ||
/// a f1 f2 f3 => a FIRST | ||
/// a f1 => can't map | ||
/// s1 s2 => can't map | ||
/// | ||
/// If \p Expanded is empty, the returned value is llvm::None. | ||
/// NOTE(review): the "can't map" cases presumably also yield llvm::None; | ||
/// confirm against the definition. | ||
/// Complexity is logarithmic. | ||
llvm::Optional<llvm::ArrayRef<syntax::Token>> | ||
spelledForExpanded(llvm::ArrayRef<syntax::Token> Expanded) const; | ||
|
||
/// Lexed tokens of a file before preprocessing. E.g. for the following input | ||
/// #define DECL(name) int name = 10 | ||
/// DECL(a); | ||
/// spelledTokens() returns | ||
/// {"#", "define", "DECL", "(", "name", ")", "int", "name", "=", "10", | ||
/// "DECL", "(", "a", ")", ";"}. | ||
/// The spelled token stream never stores a 'eof' token. | ||
/// FIXME: we do not yet store tokens of directives, like #include, #define, | ||
/// #pragma, etc. | ||
/// NOTE(review): the FIXME above appears to conflict with the example here | ||
/// and with the comment on MarkedFile::SpelledTokens, both of which say that | ||
/// directive tokens are part of the spelled stream; confirm which is current. | ||
llvm::ArrayRef<syntax::Token> spelledTokens(FileID FID) const; | ||
|
||
/// String form of the buffer intended for tests; the format is defined by | ||
/// the implementation, which is not in this header. | ||
std::string dumpForTests() const; | ||
|
||
private: | ||
/// Describes a mapping between a continuous subrange of spelled tokens and | ||
/// expanded tokens. Represents macro expansions, preprocessor directives, | ||
/// conditionally disabled pp regions, etc. | ||
/// #define FOO 1+2 | ||
/// #define BAR(a) a + 1 | ||
/// FOO // invocation #1, tokens = {'1','+','2'}, macroTokens = {'FOO'}. | ||
/// BAR(1) // invocation #2, tokens = {'1', '+', '1'}, | ||
/// macroTokens = {'BAR', '(', '1', ')'}. | ||
/// In the example, 'tokens' are the expanded tokens and 'macroTokens' are the | ||
/// spelled tokens; presumably they correspond to the [BeginExpanded, | ||
/// EndExpanded) and [BeginSpelled, EndSpelled) index ranges below. | ||
struct Mapping { | ||
// Positions in the corresponding spelled token stream. The corresponding | ||
// range is never empty. | ||
unsigned BeginSpelled = 0; | ||
unsigned EndSpelled = 0; | ||
// Positions in the expanded token stream. The corresponding range can be | ||
// empty. | ||
unsigned BeginExpanded = 0; | ||
unsigned EndExpanded = 0; | ||
|
||
/// For debugging purposes. | ||
std::string str() const; | ||
}; | ||
/// Spelled tokens of the file with information about the subranges. | ||
struct MarkedFile { | ||
/// Lexed, but not preprocessed, tokens of the file. These map directly to | ||
/// text in the corresponding files and include tokens of all preprocessor | ||
/// directives. | ||
/// FIXME: spelled tokens don't change across FileID that map to the same | ||
/// FileEntry. We could consider deduplicating them to save memory. | ||
std::vector<syntax::Token> SpelledTokens; | ||
/// A sorted list to convert between the spelled and expanded token streams. | ||
std::vector<Mapping> Mappings; | ||
/// The first expanded token produced for this FileID. | ||
unsigned BeginExpanded = 0; | ||
/// Presumably one past the last expanded token produced for this FileID, | ||
/// mirroring the Begin/End pairs in Mapping; confirm against the builder. | ||
unsigned EndExpanded = 0; | ||
}; | ||
|
||
// TokenCollector gets access to the private members of this class. | ||
friend class TokenCollector; | ||
|
||
/// Maps a single expanded token to its spelled counterpart or a mapping that | ||
/// produced it. | ||
std::pair<const syntax::Token *, const Mapping *> | ||
spelledForExpandedToken(const syntax::Token *Expanded) const; | ||
|
||
/// Token stream produced after preprocessing, conceptually this captures the | ||
/// same stream as 'clang -E' (excluding the preprocessor directives like | ||
/// #file, etc.). | ||
std::vector<syntax::Token> ExpandedTokens; | ||
/// Spelled tokens and mappings, keyed by the FileID they belong to. | ||
llvm::DenseMap<FileID, MarkedFile> Files; | ||
// The value is never null, pointer instead of reference to avoid disabling | ||
// implicit assignment operator. | ||
const SourceManager *SourceMgr; | ||
}; | ||
|
||
/// Lex the text buffer, corresponding to \p FID, in raw mode and record the | ||
/// resulting spelled tokens. Does minimal post-processing on raw identifiers, | ||
/// setting the appropriate token kind (instead of the raw_identifier reported | ||
/// by lexer in raw mode). This is a very low-level function, most users should | ||
/// prefer to use TokenCollector. Lexing in raw mode produces wildly different | ||
/// results from what one might expect when running a C++ frontend, e.g. | ||
/// preprocessor does not run at all. | ||
/// The result will *not* have a 'eof' token at the end. | ||
/// \param FID the file whose text buffer is lexed. | ||
/// \param SM source manager; presumably the one that \p FID belongs to. | ||
/// \param LO language options; presumably configure the raw lexer. | ||
/// \returns the spelled tokens of the file, by value. | ||
std::vector<syntax::Token> tokenize(FileID FID, const SourceManager &SM, | ||
const LangOptions &LO); | ||
|
||
/// Collects tokens for the main file while running the frontend action. An | ||
/// instance of this object should be created on | ||
/// FrontendAction::BeginSourceFile() and the results should be consumed after | ||
/// FrontendAction::Execute() finishes. | ||
class TokenCollector { | ||
public: | ||
/// Adds the hooks to collect the tokens. Should be called before the | ||
/// preprocessing starts, i.e. as a part of BeginSourceFile() or | ||
/// CreateASTConsumer(). | ||
/// The class has a Preprocessor reference member, so presumably \p P must | ||
/// outlive this collector. | ||
TokenCollector(Preprocessor &P); | ||
|
||
/// Finalizes token collection. Should be called after preprocessing is | ||
/// finished, i.e. after running Execute(). | ||
/// The function is rvalue-ref-qualified, so it can only be invoked on an | ||
/// rvalue, e.g. std::move(Collector).consume(). The result is marked | ||
/// LLVM_NODISCARD and must not be ignored. | ||
LLVM_NODISCARD TokenBuffer consume() &&; | ||
|
||
private: | ||
// Nested helper class; only declared here, defined elsewhere. | ||
class Builder; | ||
// Presumably the expanded tokens recorded so far by the hooks installed in | ||
// the constructor. | ||
std::vector<syntax::Token> Expanded; | ||
// Preprocessor passed to the constructor (held by reference). | ||
Preprocessor &PP; | ||
}; | ||
|
||
} // namespace syntax | ||
} // namespace clang | ||
|
||
#endif |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,10 @@ | ||
# LLVM components this library links against. | ||
set(LLVM_LINK_COMPONENTS Support) | ||
|
||
# The clangToolingSyntax library: built from Tokens.cpp and linked against the | ||
# clang libraries listed under LINK_LIBS. | ||
add_clang_library(clangToolingSyntax | ||
Tokens.cpp | ||
|
||
LINK_LIBS | ||
clangBasic | ||
clangFrontend | ||
clangLex | ||
) |
Oops, something went wrong.